Update app.py

app.py CHANGED
@@ -138,12 +138,24 @@ def check_confidence_and_augment(raw_response: str, prompt: str) -> str:

    return final_response

-def process_image(image_path: str, message: str) -> str:
+def process_image(image_data_or_path: Any, message: str) -> str:
    """Uses the VLM pipeline (LLaVA) for Visual Question Answering (VQA)."""
    global image_pipe
-
-
-
+
+    if image_pipe is None:
+        return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}"
+
+    image = None
+    try:
+        # Check if it's a file path string
+        if isinstance(image_data_or_path, str):
+            image = Image.open(image_data_or_path).convert("RGB")
+        # Check if it's raw image data (e.g., a NumPy array from the webcam)
+        elif image_data_or_path is not None:
+            # Gradio often returns image data as a NumPy array, which PIL can handle
+            image = Image.fromarray(image_data_or_path).convert("RGB")
+
+        if image:
            vqa_prompt = f"USER: {message}\nASSISTANT:"

            results = image_pipe(image, prompt=vqa_prompt)

@@ -152,9 +164,10 @@ def process_image(image_path: str, message: str) -> str:

            prompt_injection = f"**Image Analysis (VQA):** {vqa_response}\n\n**User Query:** {message}"
            return prompt_injection
-
-
-
+
+    except Exception as e:
+        print(f"Image Pipeline Error: {e}")
+        return f"[Image Processing Error: {e}] **User Query:** {message}"

    return message
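The rewritten `process_image` now accepts either a file path or raw image data. A minimal sketch of the two input shapes the branches above handle (the sample inputs are hypothetical):

```python
import numpy as np

# 1. A file path string, e.g. from a staged file upload (hypothetical path)
process_image("/tmp/upload.png", "What is in this photo?")

# 2. A raw frame, as gr.Image(type="numpy") returns from the webcam
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy 640x480 RGB frame
process_image(frame, "What is in this photo?")
```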
@@ -218,6 +231,7 @@ INTENT_STATUS_MAP = {
def get_intent_status(raw_response: str, is_vqa: bool) -> Tuple[str, str, str]:
    """Parses the Intent tag from the model's raw response and returns the intent, status, and cleaned response."""
    if is_vqa and "Image Analysis (VQA)" in raw_response:
+        # If we have VQA content in the response, treat the intent as vqa
        return "vqa", INTENT_STATUS_MAP["vqa"], raw_response

    match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
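The `[Intent: ...]` tag this regex extracts appears inline in the raw model output; a quick illustration (the sample string is hypothetical):

```python
import re

raw = "Sure, opening it now. [Intent: open_camera]"
match = re.search(r'\[Intent:\s*(\w+)\]', raw, re.IGNORECASE)
print(match.group(1))  # -> "open_camera"
```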
@@ -308,9 +322,7 @@ def generate_ppt_and_update_history(content: str, history: List[Dict[str, str]])

# --- CORE GENERATOR FUNCTION ---

-
-
-def chat_generator(message: str, image_path: str, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
+def chat_generator(message: str, image_path: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
    """The main generator function for streaming the LLM response."""

    # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
@@ -325,17 +337,19 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
    last_user_index = len(history) - 2
    original_message = history[last_user_index]['content']

-    #
-
+    # FIX: Robust check for image/file presence. Avoids ambiguous truth value error.
+    # An image is staged if image_path is not None and not an empty string.
+    # We allow image_path to be raw data (like a NumPy array) or a string path.
+    is_vqa_flow = image_path is not None and image_path != ""

    if is_vqa_flow:
-        #
+        # Process image/VQA
        message = process_image(image_path, original_message)
        # Update the user's content to reflect VQA flow for context building
        history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
    else:
        message = original_message
-        image_path = None
+        image_path = None  # Clear image_path for final yield

    # Build the prompt with conversation history (Context)
    prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
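The "ambiguous truth value" the new comment refers to is NumPy behaviour: once `image_path` can be a webcam array, a bare `if image_path:` raises. A minimal illustration of the failure the explicit check avoids:

```python
import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a webcam capture

try:
    bool(frame)  # what a bare `if image_path:` would do
except ValueError as err:
    print(err)  # "The truth value of an array with more than one element is ambiguous..."

print(frame is not None)  # True -- the identity check never coerces the array to bool
```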
@@ -428,7 +442,6 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
    elif current_intent == "open_camera":
        final_response += "\n\n📸 **Action:** I cannot directly open the camera within this chat stream, but I will prepare the UI for you to use the 'Google Lens' button if you click 'Send' now!"

-    # --- FIX START ---
    # List of intents that indicate the response is sufficient for the tool/VQA action
    # and should bypass the short-response/low-confidence Google search fallback.
    TOOL_EXECUTION_INTENTS = [
@@ -438,7 +451,6 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
    # If no download file was created AND the intent is NOT a tool/VQA intent, perform confidence check.
    if file_download_path is None and current_intent not in TOOL_EXECUTION_INTENTS:
        final_response = check_confidence_and_augment(final_response, original_message)
-    # --- FIX END ---

    audio_file_path = text_to_audio(final_response, is_voice_chat)

@@ -460,24 +472,28 @@ def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, g
def user_turn(user_message: str, chat_history: List[Dict[str, str]]) -> Tuple[str, List[Dict[str, str]]]:
    """Appends the user message to the chat history and clears the input box, using the 'messages' format."""

+    # If the user sends an empty message on a clear slate, do nothing
    if not user_message and not chat_history:
-
+        return "", chat_history

-    # If the last message is an incomplete assistant message, and no
+    # If the last message is an incomplete assistant message (content == "") and no NEW user message is provided, don't update.
+    # This prevents double submission issues if the generator is slow.
    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "" and not user_message:
        return "", chat_history

-    if user_message:
-        # Append the new user message
-
-
-
+    if user_message or (not user_message and chat_history and chat_history[-1]['role'] == 'assistant'):  # Only proceed if there's a message or we are in a follow-up state
+        # Append the new user message (or a dummy message if only an image is sent)
+        final_user_message = user_message if user_message else "Analyzing Staged Media."
+        chat_history.append({"role": "user", "content": final_user_message})
+        # Append a placeholder for the assistant's response.
+        chat_history.append({"role": "assistant", "content": ""})

    return "", chat_history

-def stage_file_upload(file_path: str) -> Tuple[
+def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
    """Stages the file path and updates the hint box."""
    if file_path:
+        # Note: file_path is a string path here
        return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✉️) to analyze.", gr.update(value="", interactive=True), gr.update(interactive=False)
    return None, "File upload cancelled/cleared.", gr.update(value="", interactive=True), gr.update(interactive=False)

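`user_turn` and the other handlers all use Gradio's 'messages' history format, a plain list of role/content dicts. What one call leaves in `chat_history`, per the logic above:

```python
_, chat_history = user_turn("Hi Luna", [])
# chat_history is now:
# [{"role": "user", "content": "Hi Luna"},
#  {"role": "assistant", "content": ""}]   # placeholder the generator streams into
```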
@@ -494,7 +510,8 @@ def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str
    last_user_prompt = ""
    for item in reversed(history):
        if item['role'] == 'user' and item['content']:
-
+            # Handle the VQA flow context update
+            last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
            break

    if not last_user_prompt:
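The added extraction strips the VQA scaffolding a stored user turn may carry; a quick check of what it yields on both message shapes this app produces (sample strings hypothetical):

```python
content = "**Image Analysis (VQA):** a red bike\n\n**User Query:** What color is the bike?"
print(content.split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip())
# -> "What color is the bike?"

content = "[IMAGE RECEIVED] What color is the bike?"
print(content.split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip())
# -> "What color is the bike?"
```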
@@ -517,7 +534,7 @@ def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]]) -
    _, chat_history = user_turn(user_message, chat_history)

    # Update the last assistant response placeholder with a status message
-    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content']:
+    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
        chat_history[-1]['content'] = "📸 Preparing camera capture..."

    # Update UI to show the webcam (start capture simulation)
@@ -549,12 +566,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
        gr.Column(min_width=1)

    # Chatbot Area
-    # --- FIX: Added type='messages' to comply with new Gradio standard ---
    chatbot = gr.Chatbot(label="Luna", height=500, type='messages')

    # Webcam Capture Area (Hidden)
    with gr.Row(visible=False) as webcam_capture_row:
-        webcam_capture_component
+        # Note: webcam_capture_component will output raw image data (NumPy array)
+        webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
        close_webcam_btn = gr.Button("✅ Use this image")

    # Audio Recording Row (Hidden)
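Because the capture component is `type="numpy"`, whatever it emits reaches `chat_generator` as a raw array rather than a path, which is what the new `Image.fromarray` branch in `process_image` handles. A hypothetical wiring sketch (this `.click` handler is illustrative; the actual wiring is outside this hunk):

```python
# Hypothetical: stage the captured frame so chat_generator receives it as image_path.
close_webcam_btn.click(
    fn=lambda img: img,                 # pass the NumPy frame through unchanged
    inputs=[webcam_capture_component],  # gr.Image(type="numpy") output
    outputs=[staged_image],             # the staged_image state named in chat_generator's outputs comment
)
```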
@@ -618,14 +635,14 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
        queue=False
    )

-    # 5. Mic wiring
+    # 5. Mic wiring
    mic_btn.click(
        fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
        inputs=[],
        outputs=[input_row, audio_record_row, hint_box],
        queue=False
    ).then(
-        fn=simulate_recording_delay,
+        fn=simulate_recording_delay,
        inputs=[],
        outputs=[],
        queue=False,