Update app.py

app.py CHANGED
@@ -138,12 +138,24 @@ def check_confidence_and_augment(raw_response: str, prompt: str) -> str:

    return final_response

-def process_image(image_path: str, message: str) -> str:
+def process_image(image_data_or_path: Any, message: str) -> str:
    """Uses the VLM pipeline (LLaVA) for Visual Question Answering (VQA)."""
    global image_pipe
-
-
-
+
+    if image_pipe is None:
+        return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}"
+
+    image = None
+    try:
+        # Check if it's a file path string
+        if isinstance(image_data_or_path, str):
+            image = Image.open(image_data_or_path).convert("RGB")
+        # Check if it's raw image data (e.g., a NumPy array from the webcam)
+        elif image_data_or_path is not None:
+            # Gradio often returns image data as a NumPy array, which PIL can handle
+            image = Image.fromarray(image_data_or_path).convert("RGB")
+
+        if image:
            vqa_prompt = f"USER: {message}\nASSISTANT:"

            results = image_pipe(image, prompt=vqa_prompt)

@@ -152,9 +164,10 @@ def process_image(image_path: str, message: str) -> str:

            prompt_injection = f"**Image Analysis (VQA):** {vqa_response}\n\n**User Query:** {message}"
            return prompt_injection
-
-
-
+
+    except Exception as e:
+        print(f"Image Pipeline Error: {e}")
+        return f"[Image Processing Error: {e}] **User Query:** {message}"

    return message
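The rewritten `process_image` now accepts either a file path or raw image data. A minimal sketch of the two input shapes the branches above handle (the sample inputs are hypothetical):

```python
import numpy as np

# 1. A file path string, e.g. from a staged file upload (hypothetical path)
process_image("/tmp/upload.png", "What is in this photo?")

# 2. A raw frame, as gr.Image(type="numpy") returns from the webcam
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy 640x480 RGB frame
process_image(frame, "What is in this photo?")
```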
@@ -218,6 +231,7 @@ INTENT_STATUS_MAP = {
def get_intent_status(raw_response: str, is_vqa: bool) -> Tuple[str, str, str]:
    """Parses the Intent tag from the model's raw response and returns the intent, status, and cleaned response."""
    if is_vqa and "Image Analysis (VQA)" in raw_response:
+        # If we have VQA content in the response, treat the intent as vqa
        return "vqa", INTENT_STATUS_MAP["vqa"], raw_response

    match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
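The `[Intent: ...]` tag this regex extracts appears inline in the raw model output; a quick illustration (the sample string is hypothetical):

```python
import re

raw = "Sure, opening it now. [Intent: open_camera]"
match = re.search(r'\[Intent:\s*(\w+)\]', raw, re.IGNORECASE)
print(match.group(1))  # -> "open_camera"
```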
@@ -308,9 +322,7 @@ def generate_ppt_and_update_history(content: str, history: List[Dict[str, str]])

# --- CORE GENERATOR FUNCTION ---

-
-
-def chat_generator(message: str, image_path: str, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
+def chat_generator(message: str, image_path: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
    """The main generator function for streaming the LLM response."""

    # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
@@ -325,17 +337,19 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
    last_user_index = len(history) - 2
    original_message = history[last_user_index]['content']

-    #
-
+    # FIX: Robust check for image/file presence. Avoids ambiguous truth value error.
+    # An image is staged if image_path is not None and not an empty string.
+    # We allow image_path to be raw data (like a NumPy array) or a string path.
+    is_vqa_flow = image_path is not None and image_path != ""

    if is_vqa_flow:
-        #
+        # Process image/VQA
        message = process_image(image_path, original_message)
        # Update the user's content to reflect VQA flow for context building
        history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
    else:
        message = original_message
-        image_path = None
+        image_path = None  # Clear image_path for final yield

    # Build the prompt with conversation history (Context)
    prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
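The "ambiguous truth value" the new comment refers to is NumPy behaviour: once `image_path` can be a webcam array, a bare `if image_path:` raises. A minimal illustration of the failure the explicit check avoids:

```python
import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a webcam capture

try:
    bool(frame)  # what a bare `if image_path:` would do
except ValueError as err:
    print(err)  # "The truth value of an array with more than one element is ambiguous..."

print(frame is not None)  # True -- the identity check never coerces the array to bool
```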
@@ -428,7 +442,6 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
    elif current_intent == "open_camera":
        final_response += "\n\n📸 **Action:** I cannot directly open the camera within this chat stream, but I will prepare the UI for you to use the 'Google Lens' button if you click 'Send' now!"

-    # --- FIX START ---
    # List of intents that indicate the response is sufficient for the tool/VQA action
    # and should bypass the short-response/low-confidence Google search fallback.
    TOOL_EXECUTION_INTENTS = [
@@ -438,7 +451,6 @@ def chat_generator(message: str, image_path: str, history: List[Dict[str, str]],
    # If no download file was created AND the intent is NOT a tool/VQA intent, perform confidence check.
    if file_download_path is None and current_intent not in TOOL_EXECUTION_INTENTS:
        final_response = check_confidence_and_augment(final_response, original_message)
-    # --- FIX END ---

    audio_file_path = text_to_audio(final_response, is_voice_chat)

@@ -460,24 +472,28 @@ def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, g
def user_turn(user_message: str, chat_history: List[Dict[str, str]]) -> Tuple[str, List[Dict[str, str]]]:
    """Appends the user message to the chat history and clears the input box, using the 'messages' format."""

+    # If the user sends an empty message on a clear slate, do nothing
    if not user_message and not chat_history:
-
+        return "", chat_history

-    # If the last message is an incomplete assistant message, and no
+    # If the last message is an incomplete assistant message (content == "") and no NEW user message is provided, don't update.
+    # This prevents double submission issues if the generator is slow.
    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "" and not user_message:
        return "", chat_history

-    if user_message:
-        # Append the new user message
-
-
-
+    if user_message or (not user_message and chat_history and chat_history[-1]['role'] == 'assistant'):  # Only proceed if there's a message or we are in a follow-up state
+        # Append the new user message (or a dummy message if only an image is sent)
+        final_user_message = user_message if user_message else "Analyzing Staged Media."
+        chat_history.append({"role": "user", "content": final_user_message})
+        # Append a placeholder for the assistant's response.
+        chat_history.append({"role": "assistant", "content": ""})

    return "", chat_history

-def stage_file_upload(file_path: str) -> Tuple[
+def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
    """Stages the file path and updates the hint box."""
    if file_path:
+        # Note: file_path is a string path here
        return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✉️) to analyze.", gr.update(value="", interactive=True), gr.update(interactive=False)
    return None, "File upload cancelled/cleared.", gr.update(value="", interactive=True), gr.update(interactive=False)

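`user_turn` and the other handlers all use Gradio's 'messages' history format, a plain list of role/content dicts. What one call leaves in `chat_history`, per the logic above:

```python
_, chat_history = user_turn("Hi Luna", [])
# chat_history is now:
# [{"role": "user", "content": "Hi Luna"},
#  {"role": "assistant", "content": ""}]   # placeholder the generator streams into
```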
@@ -494,7 +510,8 @@ def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str
    last_user_prompt = ""
    for item in reversed(history):
        if item['role'] == 'user' and item['content']:
-
+            # Handle the VQA flow context update
+            last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
            break

    if not last_user_prompt:
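The added extraction strips the VQA scaffolding a stored user turn may carry; a quick check of what it yields on both message shapes this app produces (sample strings hypothetical):

```python
content = "**Image Analysis (VQA):** a red bike\n\n**User Query:** What color is the bike?"
print(content.split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip())
# -> "What color is the bike?"

content = "[IMAGE RECEIVED] What color is the bike?"
print(content.split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip())
# -> "What color is the bike?"
```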
@@ -517,7 +534,7 @@ def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]]) -
    _, chat_history = user_turn(user_message, chat_history)

    # Update the last assistant response placeholder with a status message
-    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content']:
+    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
        chat_history[-1]['content'] = "📸 Preparing camera capture..."

    # Update UI to show the webcam (start capture simulation)
@@ -549,12 +566,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
        gr.Column(min_width=1)

    # Chatbot Area
-    # --- FIX: Added type='messages' to comply with new Gradio standard ---
    chatbot = gr.Chatbot(label="Luna", height=500, type='messages')

    # Webcam Capture Area (Hidden)
    with gr.Row(visible=False) as webcam_capture_row:
-        webcam_capture_component
+        # Note: webcam_capture_component will output raw image data (NumPy array)
+        webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
        close_webcam_btn = gr.Button("✅ Use this image")

    # Audio Recording Row (Hidden)
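Because the capture component is `type="numpy"`, whatever it emits reaches `chat_generator` as a raw array rather than a path, which is what the new `Image.fromarray` branch in `process_image` handles. A hypothetical wiring sketch (this `.click` handler is illustrative; the actual wiring is outside this hunk):

```python
# Hypothetical: stage the captured frame so chat_generator receives it as image_path.
close_webcam_btn.click(
    fn=lambda img: img,                 # pass the NumPy frame through unchanged
    inputs=[webcam_capture_component],  # gr.Image(type="numpy") output
    outputs=[staged_image],             # the staged_image state named in chat_generator's outputs comment
)
```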
@@ -618,14 +635,14 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
        queue=False
    )

-    # 5. Mic wiring
+    # 5. Mic wiring
    mic_btn.click(
        fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
        inputs=[],
        outputs=[input_row, audio_record_row, hint_box],
        queue=False
    ).then(
-        fn=simulate_recording_delay,
+        fn=simulate_recording_delay,
        inputs=[],
        outputs=[],
        queue=False,