awacke1 committed on
Commit
688cdbe
·
verified ·
1 Parent(s): 5767b4d

Update backup10.app.py

Browse files
Files changed (1) hide show
  1. backup10.app.py +125 -205
backup10.app.py CHANGED
@@ -1,31 +1,24 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- from sentence_transformers import SentenceTransformer
5
- from sklearn.metrics.pairwise import cosine_similarity
6
- import torch
7
  import json
8
  import os
9
  import glob
10
  import random
11
  from pathlib import Path
12
- from datetime import datetime, timedelta
13
  import edge_tts
14
  import asyncio
15
  import requests
16
- from collections import defaultdict
17
  import streamlit.components.v1 as components
18
- from urllib.parse import quote
19
- from xml.etree import ElementTree as ET
20
- from datasets import load_dataset
21
  import base64
22
  import re
 
 
23
 
24
  # -------------------- Configuration & Constants --------------------
25
  USER_NAMES = [
26
- "Alex", "Jordan", "Taylor", "Morgan", "Rowan", "Avery", "Riley", "Quinn",
27
- "Casey", "Jesse", "Reese", "Skyler", "Ellis", "Devon", "Aubrey", "Kendall",
28
- "Parker", "Dakota", "Sage", "Finley"
29
  ]
30
 
31
  ENGLISH_VOICES = [
@@ -34,9 +27,10 @@ ENGLISH_VOICES = [
34
  "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
35
  ]
36
 
 
 
 
37
  ROWS_PER_PAGE = 100
38
- MIN_SEARCH_SCORE = 0.3
39
- EXACT_MATCH_BOOST = 2.0
40
  SAVED_INPUTS_DIR = "saved_inputs"
41
  os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
42
 
@@ -47,7 +41,6 @@ SESSION_VARS = {
47
  'should_rerun': False,
48
  'search_columns': [],
49
  'initial_search_done': False,
50
- 'tts_voice': "en-US-AriaNeural",
51
  'arxiv_last_query': "",
52
  'dataset_loaded': False,
53
  'current_page': 0,
@@ -59,17 +52,14 @@ SESSION_VARS = {
59
  'voice_text': None,
60
  'user_name': random.choice(USER_NAMES),
61
  'max_items': 100,
62
- 'global_voice': "en-US-AriaNeural" # Default global voice
 
63
  }
64
 
65
  for var, default in SESSION_VARS.items():
66
  if var not in st.session_state:
67
  st.session_state[var] = default
68
 
69
- @st.cache_resource
70
- def get_model():
71
- return SentenceTransformer('all-MiniLM-L6-v2')
72
-
73
  def create_voice_component():
74
  mycomponent = components.declare_component(
75
  "mycomponent",
@@ -85,19 +75,17 @@ def clean_for_speech(text: str) -> str:
85
  text = re.sub(r"\s+", " ", text).strip()
86
  return text
87
 
88
- async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
89
  text = clean_for_speech(text)
90
  if not text.strip():
91
  return None
92
- rate_str = f"{rate:+d}%"
93
- pitch_str = f"{pitch:+d}Hz"
94
- communicate = edge_tts.Communicate(text, voice, rate=rate_str, pitch=pitch_str)
95
- out_fn = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
96
  await communicate.save(out_fn)
97
  return out_fn
98
 
99
  def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
100
- return asyncio.run(edge_tts_generate_audio(text, voice, 0, 0))
101
 
102
  def play_and_download_audio(file_path):
103
  if file_path and os.path.exists(file_path):
@@ -138,7 +126,6 @@ def list_saved_inputs():
138
  return files
139
 
140
  def parse_md_file(fpath):
141
- # Extract user and text from md
142
  user_line = ""
143
  ts_line = ""
144
  content_lines = []
@@ -154,139 +141,7 @@ def parse_md_file(fpath):
154
  content = "\n".join(content_lines).strip()
155
  return user_line, ts_line, content
156
 
157
- def fetch_dataset_info(dataset_id, token):
158
- info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
159
- try:
160
- response = requests.get(info_url, timeout=30)
161
- if response.status_code == 200:
162
- return response.json()
163
- except Exception:
164
- pass
165
- return None
166
-
167
- @st.cache_data
168
- def get_dataset_info(dataset_id, token):
169
- try:
170
- dataset = load_dataset(dataset_id, token=token, streaming=True)
171
- return dataset['train'].info
172
- except:
173
- return None
174
-
175
- @st.cache_data
176
- def load_dataset_page(dataset_id, token, page, rows_per_page):
177
- try:
178
- start_idx = page * rows_per_page
179
- end_idx = start_idx + rows_per_page
180
- dataset = load_dataset(
181
- dataset_id,
182
- token=token,
183
- streaming=False,
184
- split=f'train[{start_idx}:{end_idx}]'
185
- )
186
- return pd.DataFrame(dataset)
187
- except:
188
- return pd.DataFrame()
189
-
190
- class FastDatasetSearcher:
191
- def __init__(self, dataset_id="tomg-group-umd/cinepile"):
192
- self.dataset_id = dataset_id
193
- self.text_model = get_model()
194
- self.token = os.environ.get('DATASET_KEY')
195
-
196
- def load_page(self, page=0):
197
- return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
198
-
199
- def quick_search(self, query, df):
200
- if df.empty or not query.strip():
201
- return df
202
-
203
- try:
204
- searchable_cols = []
205
- if len(df) > 0:
206
- for col in df.columns:
207
- sample_val = df[col].iloc[0]
208
- if not isinstance(sample_val, (np.ndarray, bytes)):
209
- searchable_cols.append(col)
210
-
211
- query_lower = query.lower()
212
- query_terms = set(query_lower.split())
213
- query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
214
-
215
- scores = []
216
- matched_any = []
217
-
218
- for _, row in df.iterrows():
219
- text_parts = []
220
- row_matched = False
221
- exact_match = False
222
- priority_fields = ['description', 'matched_text']
223
- other_fields = [col for col in searchable_cols if col not in priority_fields]
224
-
225
- for col in priority_fields:
226
- if col in row:
227
- val = row[col]
228
- if val is not None:
229
- val_str = str(val).lower()
230
- if query_lower in val_str.split():
231
- exact_match = True
232
- if any(term in val_str.split() for term in query_terms):
233
- row_matched = True
234
- text_parts.append(str(val))
235
-
236
- for col in other_fields:
237
- val = row[col]
238
- if val is not None:
239
- val_str = str(val).lower()
240
- if query_lower in val_str.split():
241
- exact_match = True
242
- if any(term in val_str.split() for term in query_terms):
243
- row_matched = True
244
- text_parts.append(str(val))
245
-
246
- text = ' '.join(text_parts)
247
- if text.strip():
248
- text_tokens = set(text.lower().split())
249
- matching_terms = query_terms.intersection(text_tokens)
250
- keyword_score = len(matching_terms) / len(query_terms) if len(query_terms) > 0 else 0.0
251
-
252
- text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
253
- semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
254
-
255
- combined_score = 0.7 * keyword_score + 0.3 * semantic_score
256
-
257
- if exact_match:
258
- combined_score *= EXACT_MATCH_BOOST
259
- elif row_matched:
260
- combined_score *= 1.2
261
- else:
262
- combined_score = 0.0
263
- row_matched = False
264
-
265
- scores.append(combined_score)
266
- matched_any.append(row_matched)
267
-
268
- results_df = df.copy()
269
- results_df['score'] = scores
270
- results_df['matched'] = matched_any
271
-
272
- filtered_df = results_df[
273
- (results_df['matched']) |
274
- (results_df['score'] > MIN_SEARCH_SCORE)
275
- ]
276
-
277
- return filtered_df.sort_values('score', ascending=False)
278
- except:
279
- return df
280
-
281
- def play_text(text):
282
- voice = st.session_state.get('global_voice', "en-US-AriaNeural")
283
- audio_file = speak_with_edge_tts(text, voice=voice)
284
- if audio_file:
285
- play_and_download_audio(audio_file)
286
-
287
  def arxiv_search(query, max_results=3):
288
- # Simple arXiv search using RSS (for demonstration)
289
- # In production, use official arXiv API or a library.
290
  base_url = "http://export.arxiv.org/api/query"
291
  params = {
292
  'search_query': query.replace(' ', '+'),
@@ -302,29 +157,75 @@ def arxiv_search(query, max_results=3):
302
  for entry in entries:
303
  title = entry.find('a:title', ns).text.strip()
304
  summary = entry.find('a:summary', ns).text.strip()
305
- # Just truncating summary for demo
306
  summary_short = summary[:300] + "..."
307
  results.append((title, summary_short))
308
  return results
309
  return []
310
 
311
  def summarize_arxiv_results(results):
312
- # Just combine titles and short summaries
313
  lines = []
314
  for i, (title, summary) in enumerate(results, 1):
315
  lines.append(f"Result {i}: {title}\n{summary}\n")
316
  return "\n\n".join(lines)
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  def main():
319
  st.title("πŸŽ™οΈ Voice Chat & Search")
320
 
321
  # Sidebar
322
  with st.sidebar:
323
  # Editable user name
324
- st.session_state['user_name'] = st.text_input("Current User:", value=st.session_state['user_name'])
325
-
326
- # Global voice selection
327
- st.session_state['global_voice'] = st.selectbox("Select Global Voice:", ENGLISH_VOICES, index=0)
328
 
329
  st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
330
 
@@ -339,48 +240,65 @@ def main():
339
  voice_component = create_voice_component()
340
  voice_val = voice_component(my_input_value="Start speaking...")
341
 
342
- # Tabs: Voice Chat History, Arxiv Search, Dataset Search, Settings
343
  tab1, tab2, tab3, tab4 = st.tabs(["πŸ—£οΈ Voice Chat History", "πŸ“š ArXiv Search", "πŸ“Š Dataset Search", "βš™οΈ Settings"])
344
 
345
  # ------------------ Voice Chat History -------------------------
346
  with tab1:
347
  st.subheader("Voice Chat History")
348
- # List saved inputs and responses and allow playing them
349
  files = list_saved_inputs()
350
- for fpath in reversed(files):
 
351
  user, ts, content = parse_md_file(fpath)
 
 
 
 
352
  with st.expander(f"{ts} - {user}", expanded=False):
353
  st.write(content)
354
- if st.button("πŸ”Š Read Aloud", key=f"read_{fpath}"):
355
- play_text(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
  # ------------------ ArXiv Search -------------------------
358
  with tab2:
359
  st.subheader("ArXiv Search")
360
- # If we have a voice_val and autorun with ArXiv chosen:
361
  edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
362
  autorun = st.checkbox("⚑ Auto-Run", value=True)
363
- run_arxiv = st.button("πŸ” ArXiv Search")
364
 
365
  input_changed = (edited_input != st.session_state.get('old_val'))
 
366
  if autorun and input_changed and edited_input.strip():
367
- st.session_state['old_val'] = edited_input
368
- # Save user input
369
- save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
370
- with st.spinner("Searching ArXiv..."):
371
- results = arxiv_search(edited_input)
372
- if results:
373
- summary = summarize_arxiv_results(results)
374
- # Save response
375
- save_response_as_md(st.session_state['user_name'], summary, prefix="response")
376
- st.write(summary)
377
- # Autoplay TTS
378
- play_text(summary)
379
- else:
380
- st.warning("No results found on ArXiv.")
381
-
382
  if run_arxiv and edited_input.strip():
383
- # Manual trigger
 
 
 
 
384
  save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
385
  with st.spinner("Searching ArXiv..."):
386
  results = arxiv_search(edited_input)
@@ -388,33 +306,35 @@ def main():
388
  summary = summarize_arxiv_results(results)
389
  save_response_as_md(st.session_state['user_name'], summary, prefix="response")
390
  st.write(summary)
391
- play_text(summary)
 
 
 
 
392
  else:
393
  st.warning("No results found on ArXiv.")
394
 
395
  # ------------------ Dataset Search -------------------------
396
  with tab3:
397
  st.subheader("Dataset Search")
398
- search = FastDatasetSearcher()
399
  query = st.text_input("Enter dataset search query:")
400
- run_ds_search = st.button("Search Dataset")
401
- num_results = st.slider("Max results:", 1, 100, 20)
402
 
403
  if run_ds_search and query.strip():
404
  with st.spinner("Searching dataset..."):
405
- df = search.load_page()
406
- results = search.quick_search(query, df)
407
- if len(results) > 0:
408
  st.write(f"Found {len(results)} results:")
409
  shown = 0
410
- for i, (_, result) in enumerate(results.iterrows(), 1):
411
  if shown >= num_results:
412
  break
413
  with st.expander(f"Result {i}", expanded=(i==1)):
414
- # Just print result keys/values here
415
- for k, v in result.items():
416
- if k not in ['score', 'matched']:
417
- st.write(f"**{k}:** {v}")
418
  shown += 1
419
  else:
420
  st.warning("No matching results found.")
@@ -422,13 +342,13 @@ def main():
422
  # ------------------ Settings Tab -------------------------
423
  with tab4:
424
  st.subheader("Settings")
425
- st.write("Adjust voice and search parameters in the sidebar.")
426
- if st.button("πŸ—‘οΈ Clear Search History"):
 
 
427
  st.session_state['search_history'] = []
428
- # Optionally delete files:
429
- # for fpath in list_saved_inputs():
430
- # os.remove(fpath)
431
- st.success("Search history cleared!")
432
 
433
  if __name__ == "__main__":
434
  main()
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
  import json
5
  import os
6
  import glob
7
  import random
8
  from pathlib import Path
9
+ from datetime import datetime
10
  import edge_tts
11
  import asyncio
12
  import requests
 
13
  import streamlit.components.v1 as components
 
 
 
14
  import base64
15
  import re
16
+ from xml.etree import ElementTree as ET
17
+ from datasets import load_dataset
18
 
19
  # -------------------- Configuration & Constants --------------------
20
  USER_NAMES = [
21
+ "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
 
 
22
  ]
23
 
24
  ENGLISH_VOICES = [
 
27
  "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
28
  ]
29
 
30
+ # Map each user to a corresponding voice
31
+ USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
32
+
33
  ROWS_PER_PAGE = 100
 
 
34
  SAVED_INPUTS_DIR = "saved_inputs"
35
  os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
36
 
 
41
  'should_rerun': False,
42
  'search_columns': [],
43
  'initial_search_done': False,
 
44
  'arxiv_last_query': "",
45
  'dataset_loaded': False,
46
  'current_page': 0,
 
52
  'voice_text': None,
53
  'user_name': random.choice(USER_NAMES),
54
  'max_items': 100,
55
+ 'global_voice': "en-US-AriaNeural",
56
+ 'last_arxiv_input': None
57
  }
58
 
59
  for var, default in SESSION_VARS.items():
60
  if var not in st.session_state:
61
  st.session_state[var] = default
62
 
 
 
 
 
63
  def create_voice_component():
64
  mycomponent = components.declare_component(
65
  "mycomponent",
 
75
  text = re.sub(r"\s+", " ", text).strip()
76
  return text
77
 
78
async def edge_tts_generate_audio(text, voice="en-US-AriaNeural"):
    """Synthesize *text* to an MP3 file with edge-tts and return its path.

    The text is first passed through ``clean_for_speech``. If nothing
    speakable remains, no file is produced and ``None`` is returned.

    Args:
        text: Raw text to be spoken.
        voice: edge-tts voice identifier handed to ``edge_tts.Communicate``.

    Returns:
        The name of the MP3 file written to the current working directory,
        or ``None`` when the cleaned text is empty.
    """
    speakable = clean_for_speech(text)
    if speakable.strip():
        synthesizer = edge_tts.Communicate(speakable, voice)
        # Microsecond resolution keeps back-to-back calls from colliding
        # on the same file name.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        target_path = f"speech_{stamp}.mp3"
        await synthesizer.save(target_path)
        return target_path
    return None
86
 
87
def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
    """Blocking wrapper around ``edge_tts_generate_audio``.

    Args:
        text: Text to be spoken.
        voice: edge-tts voice identifier.

    Returns:
        Whatever ``edge_tts_generate_audio`` returns: the generated MP3 file
        name, or ``None`` when there was nothing to speak.
    """
    synthesis = edge_tts_generate_audio(text, voice)
    # asyncio.run drives the coroutine to completion on a fresh event loop.
    return asyncio.run(synthesis)
89
 
90
  def play_and_download_audio(file_path):
91
  if file_path and os.path.exists(file_path):
 
126
  return files
127
 
128
  def parse_md_file(fpath):
 
129
  user_line = ""
130
  ts_line = ""
131
  content_lines = []
 
141
  content = "\n".join(content_lines).strip()
142
  return user_line, ts_line, content
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def arxiv_search(query, max_results=3):
 
 
145
  base_url = "http://export.arxiv.org/api/query"
146
  params = {
147
  'search_query': query.replace(' ', '+'),
 
157
  for entry in entries:
158
  title = entry.find('a:title', ns).text.strip()
159
  summary = entry.find('a:summary', ns).text.strip()
 
160
  summary_short = summary[:300] + "..."
161
  results.append((title, summary_short))
162
  return results
163
  return []
164
 
165
def summarize_arxiv_results(results):
    """Render arXiv search hits as one plain-text summary.

    Args:
        results: Iterable of ``(title, summary)`` pairs.

    Returns:
        A string with one ``Result N: title`` / summary section per hit,
        numbered from 1 and separated by blank lines. An empty input yields
        an empty string.
    """
    sections = [
        f"Result {position}: {heading}\n{abstract}\n"
        for position, (heading, abstract) in enumerate(results, 1)
    ]
    return "\n\n".join(sections)
170
 
171
def simple_dataset_search(query, df):
    """Return the rows of *df* that contain any query term as a substring.

    Matching is case-insensitive. For every row, all string and numeric
    cell values are joined into one lower-cased text blob; a row matches
    when at least one whitespace-separated term of *query* occurs in it.

    Args:
        query: Free-text search string.
        df: ``pandas.DataFrame`` to scan.

    Returns:
        A ``pandas.DataFrame`` holding the matching rows (original index
        labels preserved), or an empty ``DataFrame`` when *df* is empty,
        *query* is blank, or nothing matches.
    """
    import numbers  # local import: only needed for the scalar type check

    if df.empty or not query.strip():
        return pd.DataFrame()

    query_terms = query.lower().split()
    matches = []
    for _, row in df.iterrows():
        text_parts = []
        for val in row:
            if isinstance(val, str):
                text_parts.append(val.lower())
            # numbers.Number also covers NumPy integer/float scalars, which
            # a plain (int, float) check misses for all-integer frames.
            # Lower-casing lets "true"/"false" match boolean cells.
            elif isinstance(val, (numbers.Number, np.bool_)):
                text_parts.append(str(val).lower())
        full_text = " ".join(text_parts)
        if any(term in full_text for term in query_terms):
            matches.append(row)

    if matches:
        return pd.DataFrame(matches)
    return pd.DataFrame()
190
+
191
+ from datasets import load_dataset
192
+
193
@st.cache_data
def _load_dataset_page_cached(dataset_id, token, page, rows_per_page):
    """Fetch one page of the ``train`` split; raises on any loading error.

    Kept separate from ``load_dataset_page`` so that only successful loads
    are stored by ``st.cache_data``: an exception propagates out of the
    cached function and nothing is cached for those arguments.
    """
    start_idx = page * rows_per_page
    end_idx = start_idx + rows_per_page
    dataset = load_dataset(
        dataset_id,
        token=token,
        streaming=False,
        split=f'train[{start_idx}:{end_idx}]'
    )
    return pd.DataFrame(dataset)


def load_dataset_page(dataset_id, token, page, rows_per_page):
    """Load rows ``[page * rows_per_page, (page + 1) * rows_per_page)`` of a dataset.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        token: Access token passed to ``load_dataset`` (may be ``None``).
        page: Zero-based page index.
        rows_per_page: Number of rows per page.

    Returns:
        A ``pandas.DataFrame`` with the requested slice of the ``train``
        split, or an empty ``DataFrame`` if loading failed. Failures are
        reported in the UI and are not cached, so a later call retries.
    """
    try:
        return _load_dataset_page_cached(dataset_id, token, page, rows_per_page)
    except Exception as exc:  # best-effort: the UI falls back to "no results"
        st.warning(f"Could not load dataset '{dataset_id}': {exc}")
        return pd.DataFrame()
207
+
208
class SimpleDatasetSearcher:
    """Thin handle on one dataset, loaded a page at a time."""

    def __init__(self, dataset_id="tomg-group-umd/cinepile"):
        """Remember the dataset id and read the access token from the environment.

        Args:
            dataset_id: Dataset identifier to load pages from.
        """
        # The token comes from the DATASET_KEY environment variable and is
        # None when that variable is unset.
        self.token = os.getenv('DATASET_KEY')
        self.dataset_id = dataset_id

    def load_page(self, page=0):
        """Return page *page* (``ROWS_PER_PAGE`` rows) via ``load_dataset_page``."""
        page_frame = load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
        return page_frame
214
+
215
def concatenate_mp3(files, output_file):
    """Write the raw bytes of *files*, in order, into *output_file*.

    This is a naive binary concatenation: no MP3 frames or tags are parsed
    or rewritten.

    Args:
        files: Iterable of paths to the input files, in playback order.
        output_file: Path of the combined file; created or overwritten.

    Raises:
        ValueError: If *output_file* is also one of the inputs. Opening it
            for writing would truncate that input before it is read.
        OSError: If an input cannot be read or the output cannot be written.
    """
    import shutil  # local import: only needed for the streamed copy

    files = list(files)
    target = os.path.abspath(output_file)
    if any(os.path.abspath(path) == target for path in files):
        raise ValueError("output_file must not be one of the input files")

    with open(output_file, 'wb') as outfile:
        for path in files:
            with open(path, 'rb') as infile:
                # Stream in chunks instead of loading each file into memory.
                shutil.copyfileobj(infile, outfile)
221
+
222
  def main():
223
  st.title("πŸŽ™οΈ Voice Chat & Search")
224
 
225
  # Sidebar
226
  with st.sidebar:
227
  # Editable user name
228
+ st.session_state['user_name'] = st.selectbox("Current User:", USER_NAMES, index=0)
 
 
 
229
 
230
  st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
231
 
 
240
  voice_component = create_voice_component()
241
  voice_val = voice_component(my_input_value="Start speaking...")
242
 
243
+ # Tabs
244
  tab1, tab2, tab3, tab4 = st.tabs(["πŸ—£οΈ Voice Chat History", "πŸ“š ArXiv Search", "πŸ“Š Dataset Search", "βš™οΈ Settings"])
245
 
246
  # ------------------ Voice Chat History -------------------------
247
  with tab1:
248
  st.subheader("Voice Chat History")
 
249
  files = list_saved_inputs()
250
+ conversation = []
251
+ for fpath in files:
252
  user, ts, content = parse_md_file(fpath)
253
+ conversation.append((user, ts, content, fpath))
254
+
255
+ # Enumerate to ensure unique keys
256
+ for i, (user, ts, content, fpath) in enumerate(reversed(conversation), start=1):
257
  with st.expander(f"{ts} - {user}", expanded=False):
258
  st.write(content)
259
+ # Make button key unique by including i
260
+ if st.button(f"πŸ”Š Read Aloud {ts}-{user}", key=f"read_{i}_{fpath}"):
261
+ voice = USER_VOICES.get(user, "en-US-AriaNeural")
262
+ audio_file = speak_with_edge_tts(content, voice=voice)
263
+ if audio_file:
264
+ play_and_download_audio(audio_file)
265
+
266
+ # Read entire conversation
267
+ if st.button("πŸ“œ Read Conversation", key="read_conversation_all"):
268
+ # conversation is currently reversed, re-reverse to get chronological
269
+ conversation_chrono = list(reversed(conversation))
270
+ mp3_files = []
271
+ for user, ts, content, fpath in conversation_chrono:
272
+ voice = USER_VOICES.get(user, "en-US-AriaNeural")
273
+ audio_file = speak_with_edge_tts(content, voice=voice)
274
+ if audio_file:
275
+ mp3_files.append(audio_file)
276
+ st.write(f"**{user} ({ts}):**")
277
+ play_and_download_audio(audio_file)
278
+
279
+ if mp3_files:
280
+ combined_file = f"full_conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
281
+ concatenate_mp3(mp3_files, combined_file)
282
+ st.write("**Full Conversation Audio:**")
283
+ play_and_download_audio(combined_file)
284
 
285
  # ------------------ ArXiv Search -------------------------
286
  with tab2:
287
  st.subheader("ArXiv Search")
 
288
  edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
289
  autorun = st.checkbox("⚑ Auto-Run", value=True)
290
+ run_arxiv = st.button("πŸ” ArXiv Search", key="run_arxiv_button")
291
 
292
  input_changed = (edited_input != st.session_state.get('old_val'))
293
+ should_run_arxiv = False
294
  if autorun and input_changed and edited_input.strip():
295
+ should_run_arxiv = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  if run_arxiv and edited_input.strip():
297
+ should_run_arxiv = True
298
+
299
+ if should_run_arxiv and st.session_state['last_arxiv_input'] != edited_input:
300
+ st.session_state['old_val'] = edited_input
301
+ st.session_state['last_arxiv_input'] = edited_input
302
  save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
303
  with st.spinner("Searching ArXiv..."):
304
  results = arxiv_search(edited_input)
 
306
  summary = summarize_arxiv_results(results)
307
  save_response_as_md(st.session_state['user_name'], summary, prefix="response")
308
  st.write(summary)
309
+ # Play summary aloud
310
+ voice = USER_VOICES.get(st.session_state['user_name'], "en-US-AriaNeural")
311
+ audio_file = speak_with_edge_tts(summary, voice=voice)
312
+ if audio_file:
313
+ play_and_download_audio(audio_file)
314
  else:
315
  st.warning("No results found on ArXiv.")
316
 
317
  # ------------------ Dataset Search -------------------------
318
  with tab3:
319
  st.subheader("Dataset Search")
320
+ ds_searcher = SimpleDatasetSearcher()
321
  query = st.text_input("Enter dataset search query:")
322
+ run_ds_search = st.button("Search Dataset", key="ds_search_button")
323
+ num_results = st.slider("Max results:", 1, 100, 20, key="ds_max_results")
324
 
325
  if run_ds_search and query.strip():
326
  with st.spinner("Searching dataset..."):
327
+ df = ds_searcher.load_page(0)
328
+ results = simple_dataset_search(query, df)
329
+ if not results.empty:
330
  st.write(f"Found {len(results)} results:")
331
  shown = 0
332
+ for i, (_, row) in enumerate(results.iterrows(), 1):
333
  if shown >= num_results:
334
  break
335
  with st.expander(f"Result {i}", expanded=(i==1)):
336
+ for k, v in row.items():
337
+ st.write(f"**{k}:** {v}")
 
 
338
  shown += 1
339
  else:
340
  st.warning("No matching results found.")
 
342
  # ------------------ Settings Tab -------------------------
343
  with tab4:
344
  st.subheader("Settings")
345
+ if st.button("πŸ—‘οΈ Clear Search History", key="clear_history"):
346
+ # Delete all files
347
+ for fpath in list_saved_inputs():
348
+ os.remove(fpath)
349
  st.session_state['search_history'] = []
350
+ st.success("Search history cleared for everyone!")
351
+ st.rerun()
 
 
352
 
353
  if __name__ == "__main__":
354
  main()