cosmosai471 committed on
Commit
a7a6d88
·
verified ·
1 Parent(s): 4f7f656

Update app.py

Files changed (1)
  1. app.py +126 -192
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # --- (Import statements remain the same) ---
2
  import gradio as gr
3
  import os
4
  import time
@@ -15,36 +14,44 @@ from diffusers import StableDiffusionPipeline
15
  from docx import Document
16
  from pptx import Presentation
17
  from io import BytesIO
18
- import numpy as np # <-- Import NumPy for robust image check
19
 
20
- # --- (CONFIGURATIONS & MODEL LOADING remain the same) ---
21
- STT_DEVICE = "cpu"
22
  os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
23
  AUDIO_DIR = "audio_outputs"
24
- DOC_DIR = "doc_outputs"
25
  if not os.path.exists(AUDIO_DIR):
26
  os.makedirs(AUDIO_DIR)
27
  if not os.path.exists(DOC_DIR):
28
  os.makedirs(DOC_DIR)
29
  REPO_ID = "cosmosai471/Luna-v3"
30
  MODEL_FILE = "luna.gguf"
31
- LOCAL_MODEL_PATH = MODEL_FILE
32
  SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
33
- # --- (safe_del, LLM loading, Pipeline loading remain the same) ---
 
 
 
 
 
 
 
 
 
34
  llm = None
35
  try:
36
  print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
37
  hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILE, local_dir=".")
38
  if not os.path.exists(LOCAL_MODEL_PATH):
39
  raise FileNotFoundError(f"Download failed for {MODEL_FILE}")
40
-
41
  print("Initializing Llama...")
42
  llm = Llama(
43
  model_path=LOCAL_MODEL_PATH,
44
- n_ctx=8192,
45
- n_threads=4,
46
- n_batch=256,
47
- n_gpu_layers=0,
48
  verbose=False
49
  )
50
  print("✅ Luna Model loaded successfully!")
@@ -64,7 +71,7 @@ except Exception as e:
64
 
65
  image_pipe = None
66
  try:
67
- VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
68
  image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
69
  print(f"✅ Loaded {VLM_MODEL_ID} for image processing.")
70
  except Exception as e:
@@ -73,43 +80,37 @@ except Exception as e:
73
  img_gen_pipe = None
74
  try:
75
  img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
76
- img_gen_pipe.to(STT_DEVICE)
77
  print("✅ Loaded Stable Diffusion (v1-5) for image generation.")
78
  except Exception as e:
79
  print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
80
 
 
81
  # --- UTILITY FUNCTIONS ---
82
 
83
  def simulate_recording_delay():
84
  time.sleep(3)
85
- return None
86
 
87
  def clean_response_stream(raw_text: str) -> str:
88
  """Cleans up raw response text by removing tags and repeats."""
89
  clean_text = re.split(r'\nUser:|\nAssistant:|</s>|Intent|Action', raw_text, 1)[0].strip()
90
  clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
91
- # Remove Intent and Confidence tags specifically for display
92
  clean_text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', clean_text).strip()
93
  words = clean_text.split()
94
- if len(words) > 4 and words[-2:] == words[-4:-2]:
95
  clean_text = ' '.join(words[:-2])
96
  return clean_text
97
 
98
  def web_search_tool(query: str) -> str:
99
- time.sleep(1.5)
100
  print(f"Simulating Google Search fallback for: {query}")
101
  return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
102
 
103
- # FIX: Confidence check operates on RAW response string
104
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
105
- """
106
- Checks confidence from the raw response tag. Triggers fallback if low.
107
- Returns the *cleaned* response (or augmented one).
108
- """
109
  confidence_match = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
110
  confidence_score = int(confidence_match.group(1)) if confidence_match else 0
111
-
112
- # Always clean the response *after* parsing confidence
113
  cleaned_response = clean_response_stream(raw_response_with_tags)
114
 
115
  if confidence_score < 70:
@@ -118,20 +119,13 @@ def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> st
118
  if "error" in cleaned_response.lower() or confidence_score == 0:
119
  final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
120
  else:
121
- # Append search results to the existing (low confidence) cleaned response
122
  final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
123
  else:
124
- # High confidence, return the already cleaned response
125
  final_response = cleaned_response
126
-
127
  return final_response
128
 
129
- # FIX: Correct VQA prompt format and error handling
130
  def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
131
- """
132
- Uses the VLM pipeline (LLaVA) for VQA.
133
- Returns the prompt injection string and a boolean indicating success.
134
- """
135
  global image_pipe
136
  success = False
137
  if image_pipe is None:
@@ -141,36 +135,27 @@ def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
141
  try:
142
  if isinstance(image_data_or_path, str):
143
  image = Image.open(image_data_or_path).convert("RGB")
144
- elif isinstance(image_data_or_path, np.ndarray): # Handle NumPy array from webcam
145
  image = Image.fromarray(image_data_or_path).convert("RGB")
146
 
147
  if image:
148
- # FIX: Use the specific format required by llava-hf/llava-1.5-7b-hf
149
  vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
150
-
151
- # Increased max_new_tokens for potentially longer VQA responses
152
  results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
153
  raw_vlm_output = results[0]['generated_text'] if results else "Error: VLM did not return text."
154
-
155
- # Extract just the assistant's part
156
  vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
157
- if not vqa_response: # Handle case where split fails or response is empty
158
- vqa_response = "VLM analysis failed or returned empty."
159
 
160
  del image
161
  success = True
162
  prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
163
  return prompt_injection, success
164
-
165
  except Exception as e:
166
  print(f"Image Pipeline Error: {e}")
167
  return f"[Image Processing Error: {e}] **User Query:** {message}", success
168
-
169
- # If image processing failed before VLM call
170
  return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
171
 
172
-
173
- # --- (transcribe_audio, text_to_audio remain the same) ---
174
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
175
  if stt_pipe is None or audio_file_path is None:
176
  error_msg = "Error: Whisper model failed to load or no audio recorded."
@@ -179,11 +164,11 @@ def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.upda
179
  transcribed_text = stt_pipe(audio_file_path)["text"]
180
  new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
181
  return (
182
- transcribed_text.strip(),
183
- f"🎙️ Transcribed: '{transcribed_text.strip()}'",
184
- gr.update(interactive=True),
185
- new_button_update,
186
- True,
187
  gr.update(visible=False)
188
  )
189
  except Exception as e:
@@ -192,20 +177,19 @@ def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.upda
192
 
193
  def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
194
  if not is_voice_chat:
195
- return None
196
- clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$|\(file=.*?\)', '', text, flags=re.DOTALL | re.MULTILINE) # Also remove file links for TTS
197
  if len(clean_text.strip()) > 5:
198
  try:
199
  audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
200
  tts = gTTS(text=clean_text.strip(), lang='en')
201
  tts.save(audio_output_path)
202
- return audio_output_path
203
  except Exception as e:
204
  print(f"gTTS Error: {e}")
205
  return None
206
  return None
207
 
208
- # --- (INTENT_STATUS_MAP remains the same) ---
209
  INTENT_STATUS_MAP = {
210
  "code_generate": "Analyzing requirements and drafting code 💻...",
211
  "code_explain": "Reviewing code logic and writing explanation 💡...",
@@ -220,29 +204,17 @@ INTENT_STATUS_MAP = {
220
  "default": "Luna is thinking...",
221
  }
222
 
223
-
224
- # FIX: Updated get_intent_status to force VQA intent more reliably
225
  def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
226
  """Parses intent/confidence, returns intent, status, cleaned text."""
227
-
228
- # 1. Parse Intent
229
  match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
230
  intent = match.group(1).lower() if match else "default"
231
-
232
- # FIX: Force 'vqa' intent if the flow started with an image, regardless of model output
233
  if is_vqa_flow:
234
  intent = "vqa"
235
-
236
- # 2. Clean Text (remove both tags for display)
237
  cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1).strip()
238
  cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1).strip()
239
-
240
- # 3. Get Status
241
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
242
  return intent, status, cleaned_text
243
 
244
-
245
- # --- (generate_file_content remains the same) ---
246
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
247
  """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
248
  file_path = None
@@ -282,60 +254,51 @@ def generate_file_content(content: str, history: List[Dict[str, str]], file_type
282
 
283
  # --- CORE GENERATOR FUNCTION ---
284
  def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
285
- """Main generator function for streaming LLM response."""
286
  # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
287
 
288
- # 1. INITIAL HISTORY CHECK
289
  if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
290
  yield history, False, "Error: Generator called in unexpected state.", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
291
  return
292
 
293
- # 2. PRE-PROCESSING & CONTEXT
294
  last_user_index = len(history) - 2
295
- original_message = history[last_user_index]['content'] # Get user msg from history
296
 
297
- # FIX: Robust check for image/file presence using isinstance and None check.
298
  is_vqa_flow = False
299
- if isinstance(image_input_data, str): # File upload path
300
  is_vqa_flow = image_input_data != ""
301
- elif isinstance(image_input_data, np.ndarray): # Webcam data
302
- is_vqa_flow = image_input_data.size > 0 # Check if array is not empty
303
- else: # Could be None or other types
304
  is_vqa_flow = image_input_data is not None
305
 
306
  vqa_success = False
307
  if is_vqa_flow:
308
- # Process image/VQA
309
  processed_message, vqa_success = process_image(image_input_data, original_message)
310
- # Update user message in history to show it was an image prompt
311
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
312
- # Use the VQA-enriched message for the LLM prompt
313
  llm_input_message = processed_message
314
  else:
315
  llm_input_message = original_message
316
- image_input_data = None # Ensure cleared if not VQA
317
 
318
- # Build the final prompt string for the LLM
319
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
320
- for item in history[:-1]: # Iterate through history up to the current turn
321
  role = item['role'].upper()
322
  content = item['content'] if item['content'] is not None else ""
323
  if role == "ASSISTANT": prompt += f"LUNA: {content}\n"
324
  elif role == "USER": prompt += f"USER: {content}\n"
325
- prompt += f"USER: {llm_input_message}\nLUNA: " # Add final user input
326
 
327
- # 3. HINT BOX & STREAM START
328
- hint_text = "✨ Luna is starting to think..."
329
- history[-1]['content'] = "" # Initialize assistant content
330
  yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
331
- time.sleep(0.5)
332
 
333
- # 4. DIRECT STREAMING
334
  full_response = ""
335
- current_intent = "default"
 
336
  try:
337
  stream = llm.create_completion(
338
- prompt=prompt, max_tokens=8192,
339
  stop=["USER:", "SYSTEM:", "</s>"],
340
  echo=False, stream=True, temperature=0.7
341
  )
@@ -349,9 +312,8 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
349
  for output in stream:
350
  token = output["choices"][0].get("text", "")
351
  full_response += token
352
- # Get intent, status hint, and cleaned text for display
353
- current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success) # Pass VQA success status
354
- history[-1]['content'] = display_text # Update chat display
355
  yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
356
  except Exception as e:
357
  _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
@@ -362,9 +324,8 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
362
 
363
  # 5. POST-PROCESSING & TOOL EXECUTION
364
  file_download_path = None
365
- _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success) # Get final cleaned content
366
 
367
- # 5a. File Generation/Tool Action based on final intent
368
  if current_intent == "image_generate":
369
  yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
370
  history, file_download_path = generate_file_content(content_for_tool, history, "image")
@@ -376,45 +337,33 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
376
  history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
377
  elif current_intent == "open_google":
378
  final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
379
- history[-1]['content'] = final_cleaned_response # Update content
380
  elif current_intent == "open_camera":
381
  final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
382
- history[-1]['content'] = final_cleaned_response # Update content
383
 
384
- # 5b. Confidence Check (only if NOT a tool intent)
385
  TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
386
  if current_intent not in TOOL_EXECUTION_INTENTS:
387
- # Pass the RAW full_response (with tags) to confidence checker
388
  final_response_content = check_confidence_and_augment(full_response, original_message)
389
- history[-1]['content'] = final_response_content # Update content if augmented
390
  else:
391
- # If it was a tool intent, the content is already set (or cleaned implicitly)
392
  final_response_content = history[-1]['content']
393
 
394
- # 5c. TTS Generation
395
- audio_file_path = text_to_audio(final_response_content, is_voice_chat)
396
 
397
- # 6. FINAL YIELD
398
  hint = "✅ Response generated."
399
- # We yield the path to the hidden file component to make it downloadable
400
- # We yield None to staged_image state to clear it *after* generation
401
  yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(), file_download_path
402
 
403
 
404
  # --- GRADIO WRAPPERS FOR UI ACTIONS ---
405
 
406
  def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
407
- new_visibility = not current_visibility
408
  return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "➕")
409
 
410
- # FIX: user_turn now only adds history if input exists, DOES NOT clear staged_image
411
  def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
412
- """
413
- Appends the user message to the chat history if text or image is provided.
414
- Clears the input box. Does NOT clear the staged_image state here.
415
- """
416
  has_text = bool(user_message)
417
- # Robust check for image presence
418
  has_image = False
419
  if isinstance(staged_image_input, str):
420
  has_image = staged_image_input != ""
@@ -423,25 +372,20 @@ def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_imag
423
  else:
424
  has_image = staged_image_input is not None
425
 
426
- # If no input, do nothing
427
  if not has_text and not has_image:
428
- return user_message, chat_history # Return original inputs
429
 
430
- # If the last turn is still generating, do nothing to prevent race conditions
431
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
432
  return user_message, chat_history
433
 
434
- # Determine message content
435
  if not has_text and has_image:
436
  user_message_to_add = "Analyzing Staged Media."
437
  else:
438
  user_message_to_add = user_message
439
 
440
- # Add messages to history
441
- chat_history.append({"role": "user", "content": user_message_to_add})
442
- chat_history.append({"role": "assistant", "content": ""}) # Add placeholder
443
 
444
- # Clear only the text input box
445
  return "", chat_history
446
 
447
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
@@ -449,12 +393,10 @@ def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
449
  return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️).", gr.update(value="", interactive=True), gr.update(interactive=False)
450
  return None, "File upload cancelled.", gr.update(value="", interactive=True), gr.update(interactive=False)
451
 
452
- # FIX: Reinstate clear_staged_media
453
  def clear_staged_media() -> gr.update:
454
  """Clears the staged media state component."""
455
  return gr.update(value=None)
456
 
457
- # --- (manual_fact_check, auto_capture_camera remain largely the same, ensure they use history format correctly) ---
458
  def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
459
  if not history or not history[-1]['content']:
460
  return history, "Error: No final response to check.", gr.update(visible=False)
@@ -470,44 +412,41 @@ def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str
470
  return new_history, "✅ Double-checked with web facts.", gr.update(visible=False)
471
 
472
  def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
473
- # Use user_turn logic to setup the chat history correctly for the intent flow
474
- _, chat_history = user_turn(user_message, chat_history, staged_image_input) # Pass staged image
475
- # Update the last assistant response placeholder with a status message
476
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
477
  chat_history[-1]['content'] = "📸 Preparing camera capture..."
478
- # Update UI to show the webcam (start capture simulation)
479
- # Note: staged_image is NOT cleared here by user_turn
480
  return "", chat_history, staged_image_input, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value="📸 Capturing in 3 seconds...", interactive=False), gr.update(value="➕")
481
 
482
 
483
  # --- GRADIO INTERFACE ---
484
 
485
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
486
-
487
- # --- (State Components remain the same) ---
488
  stop_signal = gr.State(value=False)
489
- is_voice_chat = gr.State(value=False)
490
- staged_image = gr.State(value=None)
491
  menu_visible_state = gr.State(value=False)
492
-
493
  gr.HTML("<h1 style='text-align: center; color: #4B0082;'>🌙 Luna Chat Space</h1>")
494
- hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
495
- file_download_output = gr.File(label="Generated File", visible=False) # Hidden file component
 
496
 
497
  with gr.Row(visible=False) as fact_check_btn_row:
498
  gr.Column(min_width=1); btn_fact_check = gr.Button("Fact Check 🔎"); gr.Column(min_width=1)
499
 
500
- chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
501
-
502
  with gr.Row(visible=False) as webcam_capture_row:
503
  webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
504
  close_webcam_btn = gr.Button("✅ Use this image")
505
-
506
  with gr.Row(visible=False) as audio_record_row:
507
  audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
508
-
509
  with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
510
- file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
511
  btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
512
  btn_add_files = gr.Button("📎 Upload File")
513
 
@@ -516,79 +455,74 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
516
  txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
517
  mic_btn = gr.Button("🎙️", interactive=True, size="sm")
518
  combined_btn = gr.Button("✈️", variant="primary", size="sm")
 
 
519
 
520
- audio_output = gr.Audio(visible=False)
521
-
522
- # Output components list now reflects the hidden file component
523
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
524
 
525
  # --- WIRE EVENTS ---
526
 
527
- # --- (Menu, File Upload, Take Photo events remain the same) ---
528
- btn_menu.click(fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False)
 
 
529
  def prepare_file_upload(): return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
530
  btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
531
- file_input.change(fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False)
532
- btn_take_photo.click(fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")), inputs=[], outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu], queue=False)
 
 
533
 
534
- # Webcam Close (stages the image data/path)
 
 
 
 
535
  close_webcam_btn.click(
536
  fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️).", gr.update(value="")),
537
- inputs=[webcam_capture_component],
538
- outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt], # staged_image gets the NumPy array here
539
- queue=False
540
  )
541
-
542
- # --- (Mic wiring remains the same, ensure user_turn includes staged_image) ---
543
- mic_btn.click(fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."), inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False)\
544
- .then(fn=simulate_recording_delay, inputs=[], outputs=[], queue=False)\
545
- .then(fn=lambda: (gr.update(visible=True), gr.update(visible=False), "🎙️ Processing..."), inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False)\
546
- .then(fn=transcribe_audio, inputs=audio_input, outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row], queue=False)\
547
- .then(fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False) # staged_image is passed but not modified here
548
- .then(
549
- fn=chat_generator,
550
- inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat], # staged_image is read here
551
- outputs=output_components,
552
- queue=True,
553
  ).then(
554
- fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False # Clear staged_image AFTER generation
 
 
 
 
 
 
 
 
 
 
 
555
  )
556
 
557
-
558
- # Main Submission Logic
559
- # FIX: Pass staged_image to user_turn, but DO NOT modify it there.
560
- # Clear staged_image using clear_staged_media *after* chat_generator runs.
561
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
562
-
 
563
  txt.submit(
564
- fn=user_turn,
565
- inputs=[txt, chatbot, staged_image], # Pass staged_image state
566
- outputs=[txt, chatbot], # user_turn only outputs text and history
567
- queue=False
568
  ).then(
569
- fn=chat_generator,
570
- inputs=generator_inputs, # Use the state value here
571
- outputs=output_components,
572
- queue=True,
573
  ).then(
574
- fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False # Clear state AFTER generation
575
  )
576
-
 
577
  combined_btn.click(
578
- fn=user_turn,
579
- inputs=[txt, chatbot, staged_image], # Pass staged_image state
580
- outputs=[txt, chatbot], # user_turn only outputs text and history
581
- queue=False
582
  ).then(
583
- fn=chat_generator,
584
- inputs=generator_inputs, # Use the state value here
585
- outputs=output_components,
586
- queue=True,
587
  ).then(
588
- fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False # Clear state AFTER generation
 
 
 
 
589
  )
590
 
591
- # --- (Fact Check event remains the same) ---
592
- btn_fact_check.click(fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True)
593
-
594
- demo.queue(max_size=20).launch(server_name="0.0.0.0")
 
 
1
  import gradio as gr
2
  import os
3
  import time
 
14
  from docx import Document
15
  from pptx import Presentation
16
  from io import BytesIO
17
+ import numpy as np
18
 
19
+ # --- CONFIGURATION & INITIALIZATION ---
20
+ STT_DEVICE = "cpu"
21
  os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
22
  AUDIO_DIR = "audio_outputs"
23
+ DOC_DIR = "doc_outputs"
24
  if not os.path.exists(AUDIO_DIR):
25
  os.makedirs(AUDIO_DIR)
26
  if not os.path.exists(DOC_DIR):
27
  os.makedirs(DOC_DIR)
28
  REPO_ID = "cosmosai471/Luna-v3"
29
  MODEL_FILE = "luna.gguf"
30
+ LOCAL_MODEL_PATH = MODEL_FILE
31
  SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
32
+
33
+ def safe_del(self):
34
+ try:
35
+ if hasattr(self, "close") and callable(self.close):
36
+ self.close()
37
+ except Exception:
38
+ pass
39
+ Llama.__del__ = safe_del
40
+
41
+ # --- MODEL LOADING ---
42
  llm = None
43
  try:
44
  print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
45
  hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILE, local_dir=".")
46
  if not os.path.exists(LOCAL_MODEL_PATH):
47
  raise FileNotFoundError(f"Download failed for {MODEL_FILE}")
 
48
  print("Initializing Llama...")
49
  llm = Llama(
50
  model_path=LOCAL_MODEL_PATH,
51
+ n_ctx=8192,
52
+ n_threads=4,
53
+ n_batch=256,
54
+ n_gpu_layers=0,
55
  verbose=False
56
  )
57
  print("✅ Luna Model loaded successfully!")
 
71
 
72
  image_pipe = None
73
  try:
74
+ VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
75
  image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
76
  print(f"✅ Loaded {VLM_MODEL_ID} for image processing.")
77
  except Exception as e:
 
80
  img_gen_pipe = None
81
  try:
82
  img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
83
+ img_gen_pipe.to(STT_DEVICE)
84
  print("✅ Loaded Stable Diffusion (v1-5) for image generation.")
85
  except Exception as e:
86
  print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
87
 
88
+
89
  # --- UTILITY FUNCTIONS ---
90
 
91
  def simulate_recording_delay():
92
  time.sleep(3)
93
+ return None
94
 
95
  def clean_response_stream(raw_text: str) -> str:
96
  """Cleans up raw response text by removing tags and repeats."""
97
  clean_text = re.split(r'\nUser:|\nAssistant:|</s>|Intent|Action', raw_text, 1)[0].strip()
98
  clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
 
99
  clean_text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', clean_text).strip()
100
  words = clean_text.split()
101
+ if len(words) > 4 and words[-2:] == words[-4:-2]:
102
  clean_text = ' '.join(words[:-2])
103
  return clean_text
104
 
105
  def web_search_tool(query: str) -> str:
106
+ time.sleep(1.5)
107
  print(f"Simulating Google Search fallback for: {query}")
108
  return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
109
 
 
110
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
111
+ """Checks confidence from the raw response tag. Triggers fallback if low."""
 
 
 
112
  confidence_match = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
113
  confidence_score = int(confidence_match.group(1)) if confidence_match else 0
 
 
114
  cleaned_response = clean_response_stream(raw_response_with_tags)
115
 
116
  if confidence_score < 70:
 
119
  if "error" in cleaned_response.lower() or confidence_score == 0:
120
  final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
121
  else:
 
122
  final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
123
  else:
 
124
  final_response = cleaned_response
 
125
  return final_response
126
 
 
127
  def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
128
+ """Uses the VLM pipeline (LLaVA) for VQA."""
 
 
 
129
  global image_pipe
130
  success = False
131
  if image_pipe is None:
 
135
  try:
136
  if isinstance(image_data_or_path, str):
137
  image = Image.open(image_data_or_path).convert("RGB")
138
+ elif isinstance(image_data_or_path, np.ndarray):
139
  image = Image.fromarray(image_data_or_path).convert("RGB")
140
 
141
  if image:
 
142
  vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
 
 
143
  results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
144
  raw_vlm_output = results[0]['generated_text'] if results else "Error: VLM did not return text."
 
 
145
  vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
146
+ if not vqa_response: vqa_response = "VLM analysis failed or returned empty."
 
147
 
148
  del image
149
  success = True
150
  prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
151
  return prompt_injection, success
152
+
153
  except Exception as e:
154
  print(f"Image Pipeline Error: {e}")
155
  return f"[Image Processing Error: {e}] **User Query:** {message}", success
156
+
 
157
  return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
158
 
 
 
159
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
160
  if stt_pipe is None or audio_file_path is None:
161
  error_msg = "Error: Whisper model failed to load or no audio recorded."
 
164
  transcribed_text = stt_pipe(audio_file_path)["text"]
165
  new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
166
  return (
167
+ transcribed_text.strip(),
168
+ f"🎙️ Transcribed: '{transcribed_text.strip()}'",
169
+ gr.update(interactive=True),
170
+ new_button_update,
171
+ True,
172
  gr.update(visible=False)
173
  )
174
  except Exception as e:
 
177
 
178
  def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
179
  if not is_voice_chat:
180
+ return None
181
+ clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$|\(file=.*?\)', '', text, flags=re.DOTALL | re.MULTILINE)
182
  if len(clean_text.strip()) > 5:
183
  try:
184
  audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
185
  tts = gTTS(text=clean_text.strip(), lang='en')
186
  tts.save(audio_output_path)
187
+ return audio_output_path
188
  except Exception as e:
189
  print(f"gTTS Error: {e}")
190
  return None
191
  return None
192
 
 
193
  INTENT_STATUS_MAP = {
194
  "code_generate": "Analyzing requirements and drafting code 💻...",
195
  "code_explain": "Reviewing code logic and writing explanation 💡...",
 
204
  "default": "Luna is thinking...",
205
  }
206
 
 
 
207
  def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
208
  """Parses intent/confidence, returns intent, status, cleaned text."""
 
 
209
  match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
210
  intent = match.group(1).lower() if match else "default"
 
 
211
  if is_vqa_flow:
212
  intent = "vqa"
 
 
213
  cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1).strip()
214
  cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1).strip()
 
 
215
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
216
  return intent, status, cleaned_text
217
 
 
 
218
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
219
  """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
220
  file_path = None
 
254
 
255
  # --- CORE GENERATOR FUNCTION ---
256
  def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
 
257
  # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
258
 
 
259
  if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
260
  yield history, False, "Error: Generator called in unexpected state.", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
261
  return
262
 
 
263
  last_user_index = len(history) - 2
264
+ original_message = history[last_user_index]['content']
265
 
 
266
  is_vqa_flow = False
267
+ if isinstance(image_input_data, str):
268
  is_vqa_flow = image_input_data != ""
269
+ elif isinstance(image_input_data, np.ndarray):
270
+ is_vqa_flow = image_input_data.size > 0
271
+ else:
272
  is_vqa_flow = image_input_data is not None
273
 
274
  vqa_success = False
275
  if is_vqa_flow:
 
276
  processed_message, vqa_success = process_image(image_input_data, original_message)
 
277
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
 
278
  llm_input_message = processed_message
279
  else:
280
  llm_input_message = original_message
281
+ image_input_data = None
282
 
 
283
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
284
+ for item in history[:-1]:
285
  role = item['role'].upper()
286
  content = item['content'] if item['content'] is not None else ""
287
  if role == "ASSISTANT": prompt += f"LUNA: {content}\n"
288
  elif role == "USER": prompt += f"USER: {content}\n"
289
+ prompt += f"USER: {llm_input_message}\nLUNA: "
290
 
291
+ hint_text = "✨ Luna is starting to think..."
292
+ history[-1]['content'] = ""
 
293
  yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
294
+ time.sleep(0.5)
295
 
 
296
  full_response = ""
297
+ current_intent = "default"
298
+
299
  try:
300
  stream = llm.create_completion(
301
+ prompt=prompt, max_tokens=8192,
302
  stop=["USER:", "SYSTEM:", "</s>"],
303
  echo=False, stream=True, temperature=0.7
304
  )
 
312
  for output in stream:
313
  token = output["choices"][0].get("text", "")
314
  full_response += token
315
+ current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
316
+ history[-1]['content'] = display_text
 
317
  yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
318
  except Exception as e:
319
  _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
 
324
 
325
  # 5. POST-PROCESSING & TOOL EXECUTION
326
  file_download_path = None
327
+ _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success)
328
 
 
329
  if current_intent == "image_generate":
330
  yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
331
  history, file_download_path = generate_file_content(content_for_tool, history, "image")
 
337
  history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
338
  elif current_intent == "open_google":
339
  final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
340
+ history[-1]['content'] = final_cleaned_response
341
  elif current_intent == "open_camera":
342
  final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
343
+ history[-1]['content'] = final_cleaned_response
344
 
 
345
  TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
346
  if current_intent not in TOOL_EXECUTION_INTENTS:
 
347
  final_response_content = check_confidence_and_augment(full_response, original_message)
348
+ history[-1]['content'] = final_response_content
349
  else:
 
350
  final_response_content = history[-1]['content']
351
 
352
+ audio_file_path = text_to_audio(final_response_content, is_voice_chat)
 
353
 
 
354
  hint = "✅ Response generated."
 
 
355
  yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(), file_download_path
356
 
357
 
358
  # --- GRADIO WRAPPERS FOR UI ACTIONS ---
359
 
360
  def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
361
+ new_visibility = not current_visibility
362
  return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "➕")
363
 
 
364
  def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
365
+ """Appends the user message to the chat history if text or image is provided."""
 
 
 
366
  has_text = bool(user_message)
 
367
  has_image = False
368
  if isinstance(staged_image_input, str):
369
  has_image = staged_image_input != ""
 
372
  else:
373
  has_image = staged_image_input is not None
374
 
 
375
  if not has_text and not has_image:
376
+ return user_message, chat_history
377
 
 
378
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
379
  return user_message, chat_history
380
 
 
381
  if not has_text and has_image:
382
  user_message_to_add = "Analyzing Staged Media."
383
  else:
384
  user_message_to_add = user_message
385
 
386
+ chat_history.append({"role": "user", "content": user_message_to_add})
387
+ chat_history.append({"role": "assistant", "content": ""})
 
388
 
 
389
  return "", chat_history
390
 
391
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
 
393
  return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️).", gr.update(value="", interactive=True), gr.update(interactive=False)
394
  return None, "File upload cancelled.", gr.update(value="", interactive=True), gr.update(interactive=False)
395
 
 
396
  def clear_staged_media() -> gr.update:
397
  """Clears the staged media state component."""
398
  return gr.update(value=None)
399
 
 
400
  def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
401
  if not history or not history[-1]['content']:
402
  return history, "Error: No final response to check.", gr.update(visible=False)
 
412
  return new_history, "✅ Double-checked with web facts.", gr.update(visible=False)
413
 
414
  def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
415
+ _, chat_history = user_turn(user_message, chat_history, staged_image_input)
 
 
416
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
417
  chat_history[-1]['content'] = "📸 Preparing camera capture..."
 
 
418
  return "", chat_history, staged_image_input, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value="📸 Capturing in 3 seconds...", interactive=False), gr.update(value="➕")
419
 
420
 
421
  # --- GRADIO INTERFACE ---
422
 
423
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
424
+
425
+ # --- State Components ---
426
  stop_signal = gr.State(value=False)
427
+ is_voice_chat = gr.State(value=False)
428
+ staged_image = gr.State(value=None)
429
  menu_visible_state = gr.State(value=False)
430
+
431
  gr.HTML("<h1 style='text-align: center; color: #4B0082;'>🌙 Luna Chat Space</h1>")
432
+
433
+ hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
434
+ file_download_output = gr.File(label="Generated File", visible=False)
435
 
436
  with gr.Row(visible=False) as fact_check_btn_row:
437
  gr.Column(min_width=1); btn_fact_check = gr.Button("Fact Check 🔎"); gr.Column(min_width=1)
438
 
439
+ chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
440
+
441
  with gr.Row(visible=False) as webcam_capture_row:
442
  webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
443
  close_webcam_btn = gr.Button("✅ Use this image")
444
+
445
  with gr.Row(visible=False) as audio_record_row:
446
  audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
447
+
448
  with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
449
+ file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
450
  btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
451
  btn_add_files = gr.Button("📎 Upload File")
452
 
 
455
  txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
456
  mic_btn = gr.Button("🎙️", interactive=True, size="sm")
457
  combined_btn = gr.Button("✈️", variant="primary", size="sm")
458
+
459
+ audio_output = gr.Audio(visible=False)
460
 
 
 
 
461
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
462
 
463
  # --- WIRE EVENTS ---
464
 
465
+ btn_menu.click(
466
+ fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False
467
+ )
468
+
469
  def prepare_file_upload(): return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
470
  btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
471
+
472
+ file_input.change(
473
+ fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False
474
+ )
475
 
476
+ btn_take_photo.click(
477
+ fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")),
478
+ inputs=[], outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu], queue=False
479
+ )
480
+
481
  close_webcam_btn.click(
482
  fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️).", gr.update(value="")),
483
+ inputs=[webcam_capture_component], outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt], queue=False
 
 
484
  )
485
+
486
+ mic_btn.click(
487
+ fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
488
+ inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False
 
 
 
 
 
 
 
 
489
  ).then(
490
+ fn=simulate_recording_delay, inputs=[], outputs=[], queue=False
491
+ ).then(
492
+ fn=lambda: (gr.update(visible=True), gr.update(visible=False), "🎙️ Processing recording..."),
493
+ inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False
494
+ ).then(
495
+ fn=transcribe_audio, inputs=audio_input, outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row], queue=False
496
+ ).then(
497
+ fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
498
+ ).then(
499
+ fn=chat_generator, inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat], outputs=output_components, queue=True
500
+ ).then(
501
+ fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
502
  )
503
 
 
 
 
 
504
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
505
+
506
+ # Text submit (Enter key)
507
  txt.submit(
508
+ fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
 
 
 
509
  ).then(
510
+ fn=chat_generator, inputs=generator_inputs, outputs=output_components, queue=True
 
 
 
511
  ).then(
512
+ fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
513
  )
514
+
515
+ # Send button click
516
  combined_btn.click(
517
+ fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
 
 
 
518
  ).then(
519
+ fn=chat_generator, inputs=generator_inputs, outputs=output_components, queue=True
 
 
 
520
  ).then(
521
+ fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
522
+ )
523
+
524
+ btn_fact_check.click(
525
+ fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True
526
  )
527
 
528
+ demo.queue(max_size=20).launch(server_name="0.0.0.0")
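
A minimal, standalone sketch (not part of this commit) of the tag convention that SYSTEM_PROMPT asks the model to follow and that check_confidence_and_augment relies on: the leading [Intent: ...] and [Confidence: ...] tags are parsed with the same regular expressions used above, and a score below 70 triggers the web-search fallback. The sample string here is hypothetical.

import re

def parse_tags(raw: str):
    """Extract the leading intent and confidence tags from a raw model response."""
    intent_match = re.search(r'\[Intent:\s*(\w+)\]', raw, re.IGNORECASE)
    conf_match = re.search(r'\[Confidence:\s*(\d+)\]', raw)
    intent = intent_match.group(1).lower() if intent_match else "default"
    confidence = int(conf_match.group(1)) if conf_match else 0
    return intent, confidence

# Hypothetical raw output following the SYSTEM_PROMPT format
raw = "[Intent: qa_general][Confidence: 85] Paris is the capital of France."
intent, confidence = parse_tags(raw)
needs_web_fallback = confidence < 70  # same threshold as check_confidence_and_augment
print(intent, confidence, needs_web_fallback)  # qa_general 85 False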