cosmosai471 committed on
Commit f6b95b3 · verified · 1 Parent(s): 95abdee

Update app.py

Files changed (1):
  1. app.py +47 -30
app.py CHANGED
@@ -138,12 +138,24 @@ def check_confidence_and_augment(raw_response: str, prompt: str) -> str:
 
     return final_response
 
-def process_image(image_path: str, message: str) -> str:
+def process_image(image_data_or_path: Any, message: str) -> str:
     """Uses the VLM pipeline (LLaVA) for Visual Question Answering (VQA)."""
     global image_pipe
-    if image_path and image_pipe:
-        try:
-            image = Image.open(image_path).convert("RGB")
+
+    if image_pipe is None:
+        return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}"
+
+    image = None
+    try:
+        # Check if it's a file path string
+        if isinstance(image_data_or_path, str):
+            image = Image.open(image_data_or_path).convert("RGB")
+        # Check if it's raw image data (e.g., NumPy array from webcam)
+        elif image_data_or_path is not None:
+            # Gradio often returns image data as a numpy array, which PIL can handle
+            image = Image.fromarray(image_data_or_path).convert("RGB")
+
+        if image:
             vqa_prompt = f"USER: {message}\nASSISTANT:"
 
             results = image_pipe(image, prompt=vqa_prompt)
@@ -152,9 +164,10 @@ def process_image(image_path: str, message: str) -> str:
 
             prompt_injection = f"**Image Analysis (VQA):** {vqa_response}\n\n**User Query:** {message}"
             return prompt_injection
-        except Exception as e:
-            print(f"Image Pipeline Error: {e}")
-            return f"[Image Processing Error: {e}] **User Query:** {message}"
+
+    except Exception as e:
+        print(f"Image Pipeline Error: {e}")
+        return f"[Image Processing Error: {e}] **User Query:** {message}"
 
     return message
 
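The net effect of this rewrite is that `process_image` now normalizes both input shapes before invoking the pipeline. A minimal standalone sketch of that normalization (the `normalize_image` helper and file name are illustrative, not part of app.py):

```python
import numpy as np
from PIL import Image

def normalize_image(image_data_or_path):
    """Illustrative helper (not in app.py): coerce a path or array to an RGB PIL image."""
    if isinstance(image_data_or_path, str):
        # Upload flow: a file path string
        return Image.open(image_data_or_path).convert("RGB")
    if image_data_or_path is not None:
        # Webcam flow: an HxWx3 uint8 NumPy array
        return Image.fromarray(image_data_or_path).convert("RGB")
    return None

frame = np.zeros((480, 640, 3), dtype=np.uint8)   # fake webcam capture
print(normalize_image(frame).size)                # (640, 480)
print(normalize_image(None))                      # None
# normalize_image("photo.jpg") would handle the upload flow the same way.
```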
@@ -218,6 +231,7 @@ INTENT_STATUS_MAP = {
 def get_intent_status(raw_response: str, is_vqa: bool) -> Tuple[str, str, str]:
     """Parses the Intent tag from the model's raw response and returns the intent, status, and cleaned response."""
     if is_vqa and "Image Analysis (VQA)" in raw_response:
+        # If we have VQA content in the response, treat the intent as vqa
         return "vqa", INTENT_STATUS_MAP["vqa"], raw_response
 
     match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
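For context, the `[Intent: ...]` tag that `get_intent_status` parses looks like the strings below; a quick check of the same regex (the sample strings are hypothetical):

```python
import re

pattern = re.compile(r'\[Intent:\s*(\w+)\]', re.IGNORECASE)

samples = [
    "[Intent: open_camera] Sure, let me open it.",
    "[intent:vqa] The image shows a cat.",
    "A reply with no intent tag at all.",
]
for raw in samples:
    m = pattern.search(raw)
    print(m.group(1).lower() if m else None)
# -> open_camera, vqa, None
```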
@@ -308,9 +322,7 @@ def generate_ppt_and_update_history(content: str, history: List[Dict[str, str]])
 
 # --- CORE GENERATOR FUNCTION ---
 
-# --- CORE GENERATOR FUNCTION ---
-
-def chat_generator(message: str, image_path: str, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
+def chat_generator(message: str, image_path: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
     """The main generator function for streaming the LLM response."""
 
     # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
@@ -325,17 +337,19 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
     last_user_index = len(history) - 2
     original_message = history[last_user_index]['content']
 
-    # Safely check if image_path contains a non-empty string path
-    is_vqa_flow = bool(image_path) and isinstance(image_path, str)
+    # FIX: Robust check for image/file presence. Avoids ambiguous truth value error.
+    # An image is staged if image_path is not None and not an empty string.
+    # We allow image_path to be raw data (like a NumPy array) or a string path.
+    is_vqa_flow = image_path is not None and image_path != ""
 
     if is_vqa_flow:
-        # Pre-process image/VQA
+        # Process image/VQA
         message = process_image(image_path, original_message)
         # Update the user's content to reflect VQA flow for context building
         history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
     else:
         message = original_message
-        image_path = None
+        image_path = None  # Clear image_path for final yield
 
     # Build the prompt with conversation history (Context)
     prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
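The "ambiguous truth value" the FIX comment refers to is what `bool()` raises on a multi-element NumPy array, which the old `bool(image_path)` check hit once the webcam started passing raw frames. A small repro, plus the new check's behavior on the plain-Python cases:

```python
import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)   # raw webcam capture

try:
    bool(frame)   # the old check, bool(image_path), did this implicitly
except ValueError as e:
    print(e)      # "The truth value of an array with more than one element is ambiguous..."

# The new check from the diff, on the non-array cases:
for image_path in (None, "", "upload.png"):
    print(image_path is not None and image_path != "")
# -> False, False, True
```

Whether `frame != ""` stays a scalar for array inputs depends on the NumPy version, so an even stricter guard could test `isinstance(image_path, (str, np.ndarray))` explicitly.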
@@ -428,7 +442,6 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
     elif current_intent == "open_camera":
         final_response += "\n\n📸 **Action:** I cannot directly open the camera within this chat stream, but I will prepare the UI for you to use the 'Google Lens' button if you click 'Send' now!"
 
-    # --- FIX START ---
     # List of intents that indicate the response is sufficient for the tool/VQA action
     # and should bypass the short-response/low-confidence Google search fallback.
     TOOL_EXECUTION_INTENTS = [
@@ -438,7 +451,6 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
     # If no download file was created AND the intent is NOT a tool/VQA intent, perform confidence check.
     if file_download_path is None and current_intent not in TOOL_EXECUTION_INTENTS:
         final_response = check_confidence_and_augment(final_response, original_message)
-    # --- FIX END ---
 
     audio_file_path = text_to_audio(final_response, is_voice_chat)
 
@@ -460,24 +472,28 @@ def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, g
 def user_turn(user_message: str, chat_history: List[Dict[str, str]]) -> Tuple[str, List[Dict[str, str]]]:
     """Appends the user message to the chat history and clears the input box, using the 'messages' format."""
 
+    # If the user sends an empty message on a clear slate, do nothing
     if not user_message and not chat_history:
-        pass
+        return "", chat_history
 
-    # If the last message is an incomplete assistant message, and no new user message is provided, don't update
+    # If the last message is an incomplete assistant message (content == ""), and no NEW user message is provided, don't update.
+    # This prevents double submission issues if the generator is slow.
     if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "" and not user_message:
         return "", chat_history
 
-    if user_message:
-        # Append the new user message
-        chat_history.append({"role": "user", "content": user_message})
-        # Append a placeholder for the assistant's response. CHANGE: Use "" instead of None.
-        chat_history.append({"role": "assistant", "content": ""})  # <<< FIX APPLIED HERE
+    if user_message or (not user_message and chat_history and chat_history[-1]['role'] == 'assistant'):  # Only proceed if there's a message or we are in a follow-up state
+        # Append the new user message (or a dummy message if only image is sent)
+        final_user_message = user_message if user_message else "Analyzing Staged Media."
+        chat_history.append({"role": "user", "content": final_user_message})
+        # Append a placeholder for the assistant's response.
+        chat_history.append({"role": "assistant", "content": ""})
 
     return "", chat_history
 
-def stage_file_upload(file_path: str) -> Tuple[str, str, gr.update, gr.update]:
+def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
     """Stages the file path and updates the hint box."""
     if file_path:
+        # Note: file_path is a string path here
        return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️) to analyze.", gr.update(value="", interactive=True), gr.update(interactive=False)
     return None, "File upload cancelled/cleared.", gr.update(value="", interactive=True), gr.update(interactive=False)
 
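For reference, the "messages" history that `user_turn` maintains is a flat list of role/content dicts; a quick trace of the new branches (sample texts are hypothetical):

```python
history = []

# First turn: user text plus an empty assistant placeholder that the
# generator streams into.
history.append({"role": "user", "content": "What is in this photo?"})
history.append({"role": "assistant", "content": ""})

# An image-only send gets the dummy user message from this diff:
history.append({"role": "user", "content": "Analyzing Staged Media."})
history.append({"role": "assistant", "content": ""})

# Double-submit guard: the last entry is an empty assistant placeholder and
# there is no new user text, so user_turn returns the history untouched.
blocked = history[-1]["role"] == "assistant" and history[-1]["content"] == ""
print(blocked)  # True
```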
@@ -494,7 +510,8 @@ def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str
     last_user_prompt = ""
     for item in reversed(history):
         if item['role'] == 'user' and item['content']:
-            last_user_prompt = item['content'].split("**User Query:**")[-1].strip()
+            # Handle the VQA flow context update
+            last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
             break
 
     if not last_user_prompt:
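The extended cleanup chain recovers the bare question from both message shapes the VQA flow can leave in history (sample strings hypothetical):

```python
def extract_prompt(content: str) -> str:
    # Same chain as the diff: drop the VQA preamble, then the image tag
    return content.split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()

print(extract_prompt("[IMAGE RECEIVED] What breed is this dog?"))
print(extract_prompt("**Image Analysis (VQA):** a dog\n\n**User Query:** What breed is this dog?"))
# Both print: What breed is this dog?
```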
@@ -517,7 +534,7 @@ def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]]) -
     _, chat_history = user_turn(user_message, chat_history)
 
     # Update the last assistant response placeholder with a status message
-    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] is None:
+    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
         chat_history[-1]['content'] = "📸 Preparing camera capture..."
 
     # Update UI to show the webcam (start capture simulation)
@@ -549,12 +566,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
         gr.Column(min_width=1)
 
     # Chatbot Area
-    # --- FIX: Added type='messages' to comply with new Gradio standard ---
     chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
 
     # Webcam Capture Area (Hidden)
     with gr.Row(visible=False) as webcam_capture_row:
-        webcam_capture_component = gr.Image(sources=["webcam"], show_label=False)
+        # Note: webcam_capture_component will output raw image data (NumPy array)
+        webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
         close_webcam_btn = gr.Button("✅ Use this image")
 
     # Audio Recording Row (Hidden)
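With `type="numpy"`, the webcam component hands its value to callbacks as an HxWx3 uint8 array (or None before capture), which is the shape the new `Image.fromarray` branch in `process_image` expects. A minimal standalone sketch under that assumption (the `describe` callback and component names are illustrative):

```python
import gradio as gr
import numpy as np

def describe(frame):
    # With type="numpy", Gradio delivers an HxWxC uint8 array (or None).
    if frame is None:
        return "no capture yet"
    h, w = frame.shape[:2]
    return f"captured a {w}x{h} frame, dtype={frame.dtype}"

with gr.Blocks() as sketch:
    cam = gr.Image(sources=["webcam"], type="numpy", show_label=False)
    out = gr.Textbox(label="status")
    cam.change(describe, inputs=cam, outputs=out)

# sketch.launch()
```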
@@ -618,14 +635,14 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
         queue=False
     )
 
-    # 5. Mic wiring (Fixed with simulate_recording_delay)
+    # 5. Mic wiring
     mic_btn.click(
         fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
         inputs=[],
         outputs=[input_row, audio_record_row, hint_box],
         queue=False
     ).then(
-        fn=simulate_recording_delay,  # <<< NEW STEP FOR DELAY
+        fn=simulate_recording_delay,
         inputs=[],
         outputs=[],
         queue=False,
 