Spaces:

cosmosai471
/

come_onnn

Running

App Files Community

cosmosai471 committed on Oct 28

Commit

4f7f656

verified ·

1 Parent(s): b27fc81

Update app.py

Browse files

Files changed (1) hide show

app.py +252 -395

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import os
 import time
@@ -9,41 +10,27 @@ from llama_cpp import Llama
 from typing import List, Dict, Any, Tuple
 from PIL import Image
 from transformers import pipeline
-from gtts import gTTS
 from diffusers import StableDiffusionPipeline
-from docx import Document
-from pptx import Presentation
-from io import BytesIO
-# --- CONFIGURATION & INITIALIZATION ---
-# Set device for pipelines (STT/VQA/ImageGen). Use "cpu" for compatibility.
-STT_DEVICE = "cpu"
 os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
 AUDIO_DIR = "audio_outputs"
-DOC_DIR = "doc_outputs"
 if not os.path.exists(AUDIO_DIR):
     os.makedirs(AUDIO_DIR)
 if not os.path.exists(DOC_DIR):
     os.makedirs(DOC_DIR)
-# Hugging Face Model Info
 REPO_ID = "cosmosai471/Luna-v3"
 MODEL_FILE = "luna.gguf"
-LOCAL_MODEL_PATH = MODEL_FILE
-# FIX: Updated SYSTEM PROMPT for Confidence Scoring
 SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
-# Helper to safely delete Llama instance (prevents resource leaks)
-def safe_del(self):
-    try:
-        if hasattr(self, "close") and callable(self.close):
-            self.close()
-    except Exception:
-        pass
-Llama.__del__ = safe_del
-# --- MODEL LOADING ---
 llm = None
 try:
     print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
@@ -54,10 +41,10 @@ try:
     print("Initializing Llama...")
     llm = Llama(
         model_path=LOCAL_MODEL_PATH,
-        n_ctx=8192,
-        n_threads=4,
-        n_batch=256,
-        n_gpu_layers=0,
         verbose=False
     )
     print("✅ Luna Model loaded successfully!")
@@ -65,11 +52,9 @@ except Exception as e:
     print(f"❌ Error loading Luna model: {e}")
     class DummyLLM:
         def create_completion(self, *args, **kwargs):
-            # Must match the new prompt format to avoid parsing errors
             yield {'choices': [{'text': '[Intent: qa_general][Confidence: 0] ERROR: Luna model failed to load. Check logs and resources.'}]}
     llm = DummyLLM()
-# --- MULTIMODAL PIPELINE LOADING ---
 stt_pipe = None
 try:
     stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=STT_DEVICE)
@@ -79,7 +64,7 @@ except Exception as e:
 image_pipe = None
 try:
-    VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
     image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
     print(f"✅ Loaded {VLM_MODEL_ID} for image processing.")
 except Exception as e:
@@ -88,148 +73,139 @@ except Exception as e:
 img_gen_pipe = None
 try:
     img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
-    img_gen_pipe.to(STT_DEVICE)
     print("✅ Loaded Stable Diffusion (v1-5) for image generation.")
 except Exception as e:
     print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
 # --- UTILITY FUNCTIONS ---
 def simulate_recording_delay():
-    """Simulates a 3-second recording time for the UI flow."""
     time.sleep(3)
-    return None
 def clean_response_stream(raw_text: str) -> str:
-    """Cleans up raw LLaMA-style output and removes repeats, and removes tags."""
-    # 1. Strip stop tokens
     clean_text = re.split(r'\nUser:|\nAssistant:|</s>|Intent|Action', raw_text, 1)[0].strip()
-    # 2. Remove instruction/action markers and new Confidence/Intent tags
     clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
     clean_text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', clean_text).strip()
-    # 3. Simple word-repeat check
     words = clean_text.split()
-    if len(words) > 4 and words[-2:] == words[-4:-2]:
         clean_text = ' '.join(words[:-2])
     return clean_text
 def web_search_tool(query: str) -> str:
-    """Simulated Google Search Fallback."""
-    time.sleep(1.5)
     print(f"Simulating Google Search fallback for: {query}")
     return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
-# FIX: Updated confidence check
 def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
     """
-    Checks the model's self-reported confidence from the *raw* response.
-    Triggers fallback if low.
     """
-    # 1. Parse Confidence Score from the raw, unprocessed response
     confidence_match = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
     confidence_score = int(confidence_match.group(1)) if confidence_match else 0
-    # 2. Clean the response *after* parsing confidence
     cleaned_response = clean_response_stream(raw_response_with_tags)
-    # 3. Check if confidence is below threshold
-    if confidence_score < 33:
         print(f"Low confidence ({confidence_score}%) detected. Triggering Google Search fallback.")
         search_snippet = web_search_tool(prompt)
         if "error" in cleaned_response.lower() or confidence_score == 0:
              final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
         else:
             final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
     else:
-        # Confidence is high, just return the cleaned response
         final_response = cleaned_response
     return final_response
-# FIX: Updated image processing with correct VQA prompt
-def process_image(image_data_or_path: Any, message: str) -> str:
-    """Uses the VLM pipeline (LLaVA) for Visual Question Answering (VQA)."""
     global image_pipe
     if image_pipe is None:
-        return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}"
     image = None
     try:
         if isinstance(image_data_or_path, str):
             image = Image.open(image_data_or_path).convert("RGB")
-        elif image_data_or_path is not None:
             image = Image.fromarray(image_data_or_path).convert("RGB")
         if image:
-            # FIX: Use the special <image> token for the llava-1.5-hf pipeline
             vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
-            results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 768})
-            # The VLM's *full* response is in 'generated_text', including the prompt
-            raw_vlm_output = results[0]['generated_text'] if results else "The image could not be processed."
             # Extract just the assistant's part
             vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
             del image
-            # This is the VQA analysis that will be fed into Luna
             prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
-            return prompt_injection
     except Exception as e:
         print(f"Image Pipeline Error: {e}")
-        return f"[Image Processing Error: {e}] **User Query:** {message}"
-    return message
 def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
-    """Transcribes audio file using Whisper."""
     if stt_pipe is None or audio_file_path is None:
         error_msg = "Error: Whisper model failed to load or no audio recorded."
         return "", error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), False, gr.update(visible=False)
     try:
         transcribed_text = stt_pipe(audio_file_path)["text"]
         new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
         return (
-            transcribed_text.strip(),
-            f"🎙️ Transcribed: '{transcribed_text.strip()}'",
-            gr.update(interactive=True),
-            new_button_update,
-            True,
             gr.update(visible=False)
         )
     except Exception as e:
         error_msg = f"Transcription Error: {e}"
         return "", error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), False, gr.update(visible=False)
 def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
-    """Converts the final response text to an MP3 file using gTTS."""
     if not is_voice_chat:
-        return None
-    clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$', '', text, flags=re.DOTALL)
     if len(clean_text.strip()) > 5:
         try:
             audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
             tts = gTTS(text=clean_text.strip(), lang='en')
             tts.save(audio_output_path)
-            return audio_output_path
         except Exception as e:
             print(f"gTTS Error: {e}")
             return None
     return None
-# Intent and Dynamic Hint Logic
 INTENT_STATUS_MAP = {
     "code_generate": "Analyzing requirements and drafting code 💻...",
     "code_explain": "Reviewing code logic and writing explanation 💡...",
@@ -244,494 +220,375 @@ INTENT_STATUS_MAP = {
     "default": "Luna is thinking...",
 }
-def get_intent_status(raw_response: str, is_vqa: bool) -> Tuple[str, str, str]:
-    """Parses the Intent tag from the model's raw response and returns the intent, status, and cleaned response."""
     # 1. Parse Intent
     match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
     intent = match.group(1).lower() if match else "default"
-    # If it was a VQA flow (image was sent), we force the VQA intent
-    # This ensures the VQA tool override works even if Luna misidentifies the intent
-    if is_vqa:
         intent = "vqa"
     # 2. Clean Text (remove both tags for display)
     cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1).strip()
     cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1).strip()
     # 3. Get Status
     status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
     return intent, status, cleaned_text
-# --- NEW GENERATOR FUNCTIONS FOR UPGRADES ---
 def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
     """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
     file_path = None
     try:
         if file_type == "image":
-            if img_gen_pipe is None:
-                raise RuntimeError("Image generation model is not loaded.")
             image = img_gen_pipe(content).images[0]
             file_filename = f"generated_img_{random.randint(1000, 9999)}.png"
             file_path = os.path.join(DOC_DIR, file_filename)
             image.save(file_path)
-            # FIX: Format output as Gradio markdown file link
             display_content = f"🖼️ **Image Generated!**\n\n[Download {file_filename}](file={file_path})"
         elif file_type == "doc":
             doc = Document()
             doc.add_heading('Luna Generated Document', 0)
             doc.add_paragraph(content)
             file_filename = f"generated_doc_{random.randint(1000, 9999)}.docx"
             file_path = os.path.join(DOC_DIR, file_filename)
             doc.save(file_path)
             display_content = f"📄 **Document Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"
         elif file_type == "ppt":
             prs = Presentation()
-            title_slide_layout = prs.slide_layouts[0]
-            slide = prs.slides.add_slide(title_slide_layout)
             slide.shapes.title.text = "Luna Generated Presentation"
-            subtitle = slide.placeholders[1]
-            subtitle.text = content[:100] + "..."
             file_filename = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
             file_path = os.path.join(DOC_DIR, file_filename)
             prs.save(file_path)
             display_content = f"📊 **Presentation Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"
         else:
             raise ValueError(f"Unknown file type: {file_type}")
-        # Update the history with the markdown link
         history[-1]['content'] = display_content
     except Exception as e:
-        error_msg = f"❌ **Error generating {file_type.upper()}:** {e}. Please check model loading or library installation."
         history[-1]['content'] = error_msg
-        file_path = None # Ensure path is None on failure
     return history, file_path
 # --- CORE GENERATOR FUNCTION ---
-def chat_generator(message: str, image_path: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
-    """The main generator function for streaming the LLM response."""
     # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
     # 1. INITIAL HISTORY CHECK
     if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
-        yield history, False, "Error: Generator called without a recent user message in history.", gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), None, False, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
         return
     # 2. PRE-PROCESSING & CONTEXT
     last_user_index = len(history) - 2
-    original_message = history[last_user_index]['content']
-    # FIX: Robust check for image/file presence.
-    # This avoids the ValueError: `array == ""`
     is_vqa_flow = False
-    if isinstance(image_path, str):
-        is_vqa_flow = image_path != ""
-    else:
-        # It's not a string, so if it's not None, it's image data (e.g., numpy array)
-        is_vqa_flow = image_path is not None
     if is_vqa_flow:
         # Process image/VQA
-        message = process_image(image_path, original_message)
-        # Update the user's content to reflect VQA flow for context building
         history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
     else:
-        message = original_message
-        image_path = None # Clear image_path for final yield
-    # Build the prompt with conversation history (Context)
     prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
-    for i, item in enumerate(history[:-1]):
         role = item['role'].upper()
         content = item['content'] if item['content'] is not None else ""
-        if role == "ASSISTANT":
-            prompt += f"LUNA: {content}\n"
-        elif role == "USER":
-            prompt += f"USER: {content}\n"
-    prompt += f"USER: {message}\nLUNA: "
-    # 3. HINT BOX & STREAM START
-    hint_text = "✨ Luna is starting to think..."
-    history[-1]['content'] = ""
-    yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
-    time.sleep(0.5)
     # 4. DIRECT STREAMING
     full_response = ""
-    current_intent = "default"
     try:
         stream = llm.create_completion(
-            prompt=prompt,
-            max_tokens=8192,
             stop=["USER:", "SYSTEM:", "</s>"],
-            echo=False,
-            stream=True,
-            temperature=0.7
         )
     except Exception as e:
         error_text = f"❌ Error generating response: {e}"
         history[-1]['content'] = error_text
-        yield history, False, error_text, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), None, False, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
         return
     try:
         for output in stream:
             token = output["choices"][0].get("text", "")
             full_response += token
-            # Get intent and cleaned text for display
-            current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow)
-            # Update the last assistant message's content
-            history[-1]['content'] = display_text
-            # Yield continuous update
-            yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
     except Exception as e:
-        _, _, final_response_text = get_intent_status(full_response, is_vqa_flow)
         error_msg = f"⚠️ Streaming interrupted: {e}"
-        history[-1]['content'] = final_response_text
-        yield history, False, error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), None, False, gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None)
         return
     # 5. POST-PROCESSING & TOOL EXECUTION
-    # We use the *full_response* (with tags) for confidence check
-    # We use the *current_intent* (parsed during stream) for tool logic
     file_download_path = None
-    # 5a. File Generation/Tool Action
     if current_intent == "image_generate":
-        yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
-        _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow)
         history, file_download_path = generate_file_content(content_for_tool, history, "image")
-        final_response = history[-1]['content']
     elif current_intent == "doc_generate":
-        yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
-        _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow)
         history, file_download_path = generate_file_content(content_for_tool, history, "doc")
-        final_response = history[-1]['content']
     elif current_intent == "ppt_generate":
-        yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
-        _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow)
         history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
-        final_response = history[-1]['content']
     elif current_intent == "open_google":
-        _, _, final_response = get_intent_status(full_response, is_vqa_flow)
-        final_response += "\n\n🔗 **Action:** Since I cannot open a window for you, click here to search Google for this topic: [Google Search Link](https://www.google.com/search?q=open+google+simulated+search)"
     elif current_intent == "open_camera":
-        _, _, final_response = get_intent_status(full_response, is_vqa_flow)
-        final_response += "\n\n📸 **Action:** I cannot directly open the camera within this chat stream, but I will prepare the UI for you to use the 'Google Lens' button if you click 'Send' now!"
-    # 5b. Confidence Check and Augmentation (Bypassed by Intent)
-    TOOL_EXECUTION_INTENTS = [
-        "image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"
-    ]
-    # FIX: Check if intent is NOT a tool intent, then check confidence
-    if current_intent not in TOOL_EXECUTION_INTENTS:
-        # We pass the *full_response* (with tags) to the confidence checker
-        final_response = check_confidence_and_augment(full_response, original_message)
     else:
-        # If it *is* a tool intent, we just clean the response (unless it was already cleaned by a file generator)
-        if file_download_path is None:
-             _, _, final_response = get_intent_status(full_response, is_vqa_flow)
     # 5c. TTS Generation
-    audio_file_path = text_to_audio(final_response, is_voice_chat)
-    # 5d. Final History Update
-    history[-1]['content'] = final_response
     # 6. FINAL YIELD
-    hint = "✅ Response generated."
-    # We clear the staged image here by outputting None to its state component
-    yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(value=None), file_download_path
 # --- GRADIO WRAPPERS FOR UI ACTIONS ---
 def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
-    """Toggles the visibility of the media options menu."""
-    new_visibility = not current_visibility
     return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "➕")
-def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image: Any) -> Tuple[str, List[Dict[str, str]], Any]:
     """
-    Appends the user message to the chat history and clears the input box.
-    Crucially, it only adds a message if there is text OR a staged image.
-    It also clears the staged image *from the state* immediately, so it's not sticky.
     """
-    # Check if there is any input (text or image)
     has_text = bool(user_message)
-    # Check for image (robustly)
     has_image = False
-    if isinstance(staged_image, str):
-        has_image = staged_image != ""
     else:
-        has_image = staged_image is not None
-    # If no text AND no image, do nothing.
     if not has_text and not has_image:
-        return "", chat_history, staged_image # Return original state
-    # If the last message is an incomplete assistant message, wait for it to finish.
     if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
-        return user_message, chat_history, staged_image # Return original state, don't clear text yet
-    # We have a valid submission (text, image, or both)
-    # If no text was provided but an image was, create a default message
     if not has_text and has_image:
-        user_message = "Analyzing Staged Media."
-    chat_history.append({"role": "user", "content": user_message})
-    chat_history.append({"role": "assistant", "content": ""})
-    # FIX: Clear the text box and CLEAR THE STAGED IMAGE STATE
-    return "", chat_history, None
 def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
-    """Stages the file path and updates the hint box."""
     if file_path:
-        # file_path is a string path here
-        return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️) to analyze.", gr.update(value="", interactive=True), gr.update(interactive=False)
-    return None, "File upload cancelled/cleared.", gr.update(value="", interactive=True), gr.update(interactive=False)
 def clear_staged_media() -> gr.update:
-    """Clears the staged media state after sending or canceling."""
-    # This function is now redundant because user_turn handles clearing,
-    # but we will keep it for the explicit .then() calls if needed, though they are removed.
     return gr.update(value=None)
 def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
-    """Triggers a manual fact check/web search, using the 'messages' format."""
     if not history or not history[-1]['content']:
         return history, "Error: No final response to check.", gr.update(visible=False)
     last_user_prompt = ""
     for item in reversed(history):
         if item['role'] == 'user' and item['content']:
             last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
             break
-    if not last_user_prompt:
-        return history, "Error: Could not find the original user query.", gr.update(visible=False)
     web_results = web_search_tool(last_user_prompt)
     new_history = list(history)
     new_history[-1]['content'] += web_results
     return new_history, "✅ Double-checked with web facts.", gr.update(visible=False)
-def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
-    """
-    Simulates the automatic capture action by updating the UI components
-    to show the camera, and then immediately capturing (simulated).
-    """
-    _, chat_history, staged_image = user_turn(user_message, chat_history, staged_image)
     if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
         chat_history[-1]['content'] = "📸 Preparing camera capture..."
-    return "", chat_history, staged_image, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value="📸 Capturing in 3 seconds...", interactive=False), gr.update(value="➕")
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
-    # --- State Components ---
     stop_signal = gr.State(value=False)
-    is_voice_chat = gr.State(value=False)
-    staged_image = gr.State(value=None)
     menu_visible_state = gr.State(value=False)
     gr.HTML("<h1 style='text-align: center; color: #4B0082;'>🌙 Luna Chat Space</h1>")
-    # Hint Box
-    hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
-    # FIX: File Download Box is now INVISIBLE. Downloads will appear in chat.
-    file_download_output = gr.File(label="Generated File", visible=False)
-    # Fact Check button row
-    with gr.Row(visible=False) as fact_check_btn_row:
-        gr.Column(min_width=1)
-        btn_fact_check = gr.Button("Fact Check 🔎")
-        gr.Column(min_width=1)
-    # Chatbot Area
-    chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
-    # Webcam Capture Area (Hidden)
     with gr.Row(visible=False) as webcam_capture_row:
-        # type="numpy" ensures raw data is passed
         webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
         close_webcam_btn = gr.Button("✅ Use this image")
-    # Audio Recording Row (Hidden)
     with gr.Row(visible=False) as audio_record_row:
         audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
-    # Option Menu (Hidden)
     with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
-        # type="filepath" ensures a path string is passed
-        file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
         btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
         btn_add_files = gr.Button("📎 Upload File")
-    # Fixed Input Row (Footer)
     with gr.Row(variant="panel") as input_row:
         btn_menu = gr.Button("➕", interactive=True, size="sm")
         txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
         mic_btn = gr.Button("🎙️", interactive=True, size="sm")
         combined_btn = gr.Button("✈️", variant="primary", size="sm")
-    audio_output = gr.Audio(visible=False)
-    # Group all output components for convenience
     output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
     # --- WIRE EVENTS ---
-    # 1. Menu Button
-    btn_menu.click(
-        fn=toggle_menu,
-        inputs=[menu_visible_state],
-        outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu],
-        queue=False
-    )
-    # 2. File Upload
-    def prepare_file_upload():
-        return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
     btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
-    file_input.change(
-        fn=stage_file_upload,
-        inputs=[file_input],
-        outputs=[staged_image, hint_box, txt, file_input],
-        queue=False
-    )
-    # 3. 'Take photo' (Webcam)
-    btn_take_photo.click(
-        fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")),
-        inputs=[],
-        outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu],
-        queue=False
-    )
-    # 4. Webcam Close
     close_webcam_btn.click(
-        fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️) to process.", gr.update(value="")),
         inputs=[webcam_capture_component],
-        outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt],
         queue=False
     )
-    # 5. Mic wiring
-    mic_btn.click(
-        fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
-        inputs=[],
-        outputs=[input_row, audio_record_row, hint_box],
-        queue=False
-    ).then(
-        fn=simulate_recording_delay,
-        inputs=[],
-        outputs=[],
-        queue=False,
-    ).then(
-        fn=lambda: (gr.update(visible=True), gr.update(visible=False), "🎙️ Processing recording..."),
-        inputs=[],
-        outputs=[input_row, audio_record_row, hint_box],
-        queue=False,
-    ).then(
-        fn=transcribe_audio,
-        inputs=audio_input,
-        outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row],
-        queue=False
-    ).then(
-        fn=user_turn,
-        inputs=[txt, chatbot, staged_image],
-        outputs=[txt, chatbot, staged_image],
-        queue=False
-    ).then(
-        fn=chat_generator,
-        inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat],
         outputs=output_components,
         queue=True,
     )
-    # 6. Main Submission Logic (Text submit and Send button)
     generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
-    # Text submit (Enter key)
     txt.submit(
-        # FIX: user_turn now also takes staged_image as input and output
         fn=user_turn,
-        inputs=[txt, chatbot, staged_image],
-        outputs=[txt, chatbot, staged_image],
         queue=False
     ).then(
-        fn=chat_generator,
-        inputs=generator_inputs,
-        outputs=output_components,
         queue=True,
     )
-    # Send button click
     combined_btn.click(
-        # FIX: user_turn now also takes staged_image as input and output
         fn=user_turn,
-        inputs=[txt, chatbot, staged_image],
-        outputs=[txt, chatbot, staged_image],
         queue=False
     ).then(
         fn=chat_generator,
-        inputs=generator_inputs,
         outputs=output_components,
-        queue=True
-    )
-    # 7. Fact Check Button
-    btn_fact_check.click(
-        fn=manual_fact_check,
-        inputs=[chatbot],
-        outputs=[chatbot, hint_box, fact_check_btn_row],
-        queue=True
     )
-demo.queue(max_size=20).launch(server_name="0.0.0.0")

+# --- (Import statements remain the same) ---
 import gradio as gr
 import os
 import time
 from typing import List, Dict, Any, Tuple
 from PIL import Image
 from transformers import pipeline
+from gtts import gTTS
 from diffusers import StableDiffusionPipeline
+from docx import Document
+from pptx import Presentation
+from io import BytesIO
+import numpy as np # <-- Import NumPy for robust image check
+# --- (CONFIGURATIONS & MODEL LOADING remain the same) ---
+STT_DEVICE = "cpu"
 os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
 AUDIO_DIR = "audio_outputs"
+DOC_DIR = "doc_outputs"
 if not os.path.exists(AUDIO_DIR):
     os.makedirs(AUDIO_DIR)
 if not os.path.exists(DOC_DIR):
     os.makedirs(DOC_DIR)
 REPO_ID = "cosmosai471/Luna-v3"
 MODEL_FILE = "luna.gguf"
+LOCAL_MODEL_PATH = MODEL_FILE
 SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
+# --- (safe_del, LLM loading, Pipeline loading remain the same) ---
 llm = None
 try:
     print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
     print("Initializing Llama...")
     llm = Llama(
         model_path=LOCAL_MODEL_PATH,
+        n_ctx=8192,
+        n_threads=4,
+        n_batch=256,
+        n_gpu_layers=0,
         verbose=False
     )
     print("✅ Luna Model loaded successfully!")
     print(f"❌ Error loading Luna model: {e}")
     class DummyLLM:
         def create_completion(self, *args, **kwargs):
             yield {'choices': [{'text': '[Intent: qa_general][Confidence: 0] ERROR: Luna model failed to load. Check logs and resources.'}]}
     llm = DummyLLM()
 stt_pipe = None
 try:
     stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=STT_DEVICE)
 image_pipe = None
 try:
+    VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
     image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
     print(f"✅ Loaded {VLM_MODEL_ID} for image processing.")
 except Exception as e:
 img_gen_pipe = None
 try:
     img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
+    img_gen_pipe.to(STT_DEVICE)
     print("✅ Loaded Stable Diffusion (v1-5) for image generation.")
 except Exception as e:
     print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
 # --- UTILITY FUNCTIONS ---
 def simulate_recording_delay():
     """Block for three seconds to stand in for a fixed recording window.

     Produces no value; it only delays whatever step follows it.
     """
     time.sleep(3)
 def clean_response_stream(raw_text: str) -> str:
     """Strip control tags and trailing artefacts from raw model output.

     Cleaning happens in this order:

     1. ``[Intent: ...]`` and ``[Confidence: ...]`` tags are removed first, so
        the marker split below can never cut the text at the leading tag.
     2. Everything from the first hallucinated turn marker onwards is dropped:
        a new line starting with ``User:``, ``Assistant:``, ``Intent:`` or
        ``Action:``, or an ``</s>`` token.
     3. ``[INST]`` / ``[s]`` markers and ``<action>...</action>`` spans are
        removed.
     4. If the last two words repeat the two words before them, the repeat is
        dropped.

     Args:
         raw_text: Model output, possibly still carrying tags and markers.

     Returns:
         The cleaned text with surrounding whitespace removed.
     """
     # Tags go first: splitting on the bare word "Intent" (as before) reduced
     # every tagged reply to "[" and truncated prose containing "Action".
     text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', raw_text)
     # Only cut at markers that start a new line and end with a colon.
     text = re.split(r'\n(?:User|Assistant|Intent|Action):|</s>', text, maxsplit=1)[0].strip()
     text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', text, flags=re.DOTALL).strip()
     words = text.split()
     if len(words) > 4 and words[-2:] == words[-4:-2]:
         # Re-joining on single spaces also flattens other whitespace; this
         # only happens when a trailing two-word stutter is detected.
         text = ' '.join(words[:-2])
     return text
 def web_search_tool(query: str, delay: float = 1.5) -> str:
     """Return a canned "web search" snippet for *query*.

     No search is actually performed: the function waits, logs the query and
     returns a fixed sentence that embeds the query text.

     Args:
         query: Text to embed in the snippet.
         delay: Seconds to wait before answering, simulating network latency.
             Defaults to the previously hard-coded 1.5; pass 0 to skip the wait.

     Returns:
         A markdown snippet that starts with two newlines.
     """
     if delay > 0:
         time.sleep(delay)
     print(f"Simulating Google Search fallback for: {query}")
     return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
+# FIX: Confidence check operates on RAW response string
 def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
     """Return the cleaned reply, augmented with a search snippet when confidence is low.

     The score is read from the ``[Confidence: N]`` tag of the raw text; a
     missing tag counts as 0. Scores of 70 or more return the cleaned reply
     unchanged. Below that, a snippet from ``web_search_tool(prompt)`` is either
     appended to the cleaned reply or, when the score is 0 or the reply mentions
     "error", used in place of it behind an apology.

     Args:
         raw_response_with_tags: Model output still containing its tags.
         prompt: The user's question, used as the search query.

     Returns:
         The text to show as the assistant's answer.
     """
     tag = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
     score = 0 if tag is None else int(tag.group(1))
     # Cleaning is done only after the score has been parsed from the raw text.
     answer = clean_response_stream(raw_response_with_tags)

     if score >= 70:
         return answer

     print(f"Low confidence ({score}%) detected. Triggering Google Search fallback.")
     snippet = web_search_tool(prompt)
     unusable = score == 0 or "error" in answer.lower()
     if unusable:
         return f"I apologize for the limited response (Confidence: {score}%). {snippet} I will use this to generate a more comprehensive answer."
     return f"{answer} {snippet} I can elaborate further based on this."
+# FIX: Correct VQA prompt format and error handling
def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
    """Run visual question answering on a staged image.

    Accepts either a file path (``str``) or a NumPy array, converts it to an
    RGB PIL image and queries ``image_pipe`` with a LLaVA-style prompt.

    Args:
        image_data_or_path: Path to an image file, or image data as an array.
        message: The user's question about the image.

    Returns:
        ``(text, ok)``. On success ``text`` holds the model's analysis followed
        by the user query and ``ok`` is True. On any failure ``text`` starts
        with an ``[Image Processing Error: ...]`` marker and ``ok`` is False.
    """
    if image_pipe is None:
        return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}", False

    try:
        if isinstance(image_data_or_path, str):
            picture = Image.open(image_data_or_path).convert("RGB")
        elif isinstance(image_data_or_path, np.ndarray):
            picture = Image.fromarray(image_data_or_path).convert("RGB")
        else:
            # Any other input type is reported as "could not load" below.
            picture = None

        if picture:
            llava_prompt = f"USER: <image>\n{message}\nASSISTANT:"
            outputs = image_pipe(picture, prompt=llava_prompt, generate_kwargs={"max_new_tokens": 1024})
            generated = outputs[0]['generated_text'] if outputs else "Error: VLM did not return text."
            # Keep only what follows the last "ASSISTANT:" marker.
            answer = generated.split("ASSISTANT:")[-1].strip()
            if not answer:
                answer = "VLM analysis failed or returned empty."
            return f"**VQA Analysis:** {answer}\n\n**User Query:** {message}", True
    except Exception as e:
        print(f"Image Pipeline Error: {e}")
        return f"[Image Processing Error: {e}] **User Query:** {message}", False

    return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", False
+# --- (transcribe_audio, text_to_audio remain the same) ---
 def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
     """Transcribe a recorded audio file with the speech-to-text pipeline.

     Args:
         audio_file_path: Path of the recording, or ``None`` if nothing was recorded.

     Returns:
         A 6-tuple: transcribed text, hint message, textbox update, send-button
         update, voice-chat flag, and an update hiding the fact-check row. On
         failure the text is empty, the hint carries the error and the flag is
         False.
     """
     def send_button():
         return gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])

     def failed(reason: str):
         return "", reason, gr.update(interactive=True), send_button(), False, gr.update(visible=False)

     if stt_pipe is None or audio_file_path is None:
         return failed("Error: Whisper model failed to load or no audio recorded.")

     try:
         spoken = stt_pipe(audio_file_path)["text"].strip()
         return (
             spoken,
             f"🎙️ Transcribed: '{spoken}'",
             gr.update(interactive=True),
             send_button(),
             True,
             gr.update(visible=False),
         )
     except Exception as e:
         return failed(f"Transcription Error: {e}")
 def text_to_audio(text: str, is_voice_chat: bool) -> "str | None":
     """Synthesise *text* to an MP3 file with gTTS for voice-chat turns.

     Before synthesis the text is stripped of fenced code blocks, image
     processing error markers, the web-search snippet line and ``(file=...)``
     link targets, none of which should be read aloud.

     Args:
         text: The assistant's final reply.
         is_voice_chat: Speech is only produced when this is True.

     Returns:
         Path of the saved MP3 under ``AUDIO_DIR``, or ``None`` when voice chat
         is off, fewer than six characters remain to speak, or synthesis fails.
     """
     if not is_voice_chat:
         return None

     # The snippet pattern matches "**Web Search Results for '...':**" too; the
     # old pattern required a colon right after "Results" and never matched.
     speakable = re.sub(
         r'```.*?```|\[Image Processing Error:.*?\]|🌐?\s*\*\*Web Search Results\b.*?$|\(file=.*?\)',
         '',
         text,
         flags=re.DOTALL | re.MULTILINE,
     ).strip()
     if len(speakable) <= 5:
         return None

     try:
         audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
         gTTS(text=speakable, lang='en').save(audio_output_path)
         return audio_output_path
     except Exception as e:
         # Best effort: a TTS failure must not break the chat turn.
         print(f"gTTS Error: {e}")
         return None
+# --- (INTENT_STATUS_MAP remains the same) ---
 # Status-hint text keyed by intent tag. get_intent_status() looks an intent up
 # here and falls back to the "default" entry for unknown intents.
 # NOTE(review): this diff view appears to show only part of the mapping;
 # chat_generator() indexes it directly with the image/doc/ppt generation
 # intents, so those keys must exist in the full file. Confirm before editing.
 INTENT_STATUS_MAP = {
     "code_generate": "Analyzing requirements and drafting code 💻...",
     "code_explain": "Reviewing code logic and writing explanation 💡...",
     "default": "Luna is thinking...",
 }
+# FIX: Updated get_intent_status to force VQA intent more reliably
def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
    """Parse the intent tag and return the intent, its status hint and display text.

    Args:
        raw_response: Model output accumulated so far, possibly with its
            ``[Intent: ...]`` and ``[Confidence: ...]`` tags.
        is_vqa_flow: When True the intent is forced to ``"vqa"`` regardless of
            what the model emitted.

    Returns:
        ``(intent, status, cleaned_text)``: the lower-cased intent (``"default"``
        when no tag is present), the hint from ``INTENT_STATUS_MAP`` (falling
        back to its ``"default"`` entry), and the response with the first
        occurrence of each tag removed.
    """
    # 1. Parse intent
    match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
    intent = match.group(1).lower() if match else "default"
    if is_vqa_flow:
        intent = "vqa"

    # 2. Strip both tags for display. The same case-insensitive matching as the
    #    parse above is used, so a tag that was recognised is always removed.
    cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1, flags=re.IGNORECASE).strip()
    cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1, flags=re.IGNORECASE).strip()

    # 3. Status hint
    status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
    return intent, status, cleaned_text
+# --- (generate_file_content remains the same) ---
 def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
     """Create an image, DOCX or PPTX from *content* and record the outcome in the chat.

     The last history entry's ``content`` is overwritten with either a download
     message or an error message.

     Args:
         content: Prompt (for images) or body text (for documents and slides).
         history: Chat messages; the final entry is updated in place.
         file_type: One of ``"image"``, ``"doc"`` or ``"ppt"``.

     Returns:
         ``(history, path)`` where ``path`` is the saved file under ``DOC_DIR``,
         or ``None`` if generation failed.
     """
     saved_to = None
     try:
         if file_type not in ("image", "doc", "ppt"):
             raise ValueError(f"Unknown file type: {file_type}")

         if file_type == "image":
             if img_gen_pipe is None:
                 raise RuntimeError("Image generation model not loaded.")
             picture = img_gen_pipe(content).images[0]
             name = f"generated_img_{random.randint(1000, 9999)}.png"
             saved_to = os.path.join(DOC_DIR, name)
             picture.save(saved_to)
             message = f"🖼️ **Image Generated!**\n\n[Download {name}](file={saved_to})"
         elif file_type == "doc":
             document = Document()
             document.add_heading('Luna Generated Document', 0)
             document.add_paragraph(content)
             name = f"generated_doc_{random.randint(1000, 9999)}.docx"
             saved_to = os.path.join(DOC_DIR, name)
             document.save(saved_to)
             message = f"📄 **Document Generated!** Summary:\n\n{content[:200]}...\n\n[Download {name}](file={saved_to})"
         else:
             deck = Presentation()
             # Layout 0 provides the title and the placeholder filled below.
             title_slide = deck.slides.add_slide(deck.slide_layouts[0])
             title_slide.shapes.title.text = "Luna Generated Presentation"
             title_slide.placeholders[1].text = content[:100] + "..."
             name = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
             saved_to = os.path.join(DOC_DIR, name)
             deck.save(saved_to)
             message = f"📊 **Presentation Generated!** Summary:\n\n{content[:200]}...\n\n[Download {name}](file={saved_to})"

         history[-1]['content'] = message
     except Exception as e:
         history[-1]['content'] = f"❌ **Error generating {file_type.upper()}:** {e}. Check logs/libs."
         saved_to = None
     return history, saved_to
 # --- CORE GENERATOR FUNCTION ---
def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
    """Stream the assistant reply for the pending turn, yielding UI updates.

    Every yield is an 11-tuple in this order: chat history, stop signal, hint
    text, textbox update, send-button update, audio path, voice-chat flag,
    fact-check row update, staged image, file-input update, downloadable file.

    Args:
        message_from_input: Not used; the user's text is read from ``history``.
        image_input_data: Staged media: a file path, a NumPy array or ``None``.
        history: Chat messages. Must end with the user's entry followed by an
            empty assistant placeholder, otherwise an error is yielded.
        stop_signal: Echoed back while generating; it is not inspected here.
        is_voice_chat: Whether the final reply should also be synthesised.

    Yields:
        UI state tuples as described above; the last one re-enables the input,
        clears the staged image and carries the audio/file paths, if any.
    """
    def in_progress(hint, textbox_update=None):
        """UI tuple while generating: input locked, stop button shown."""
        return (
            history, stop_signal, hint,
            gr.update(interactive=False) if textbox_update is None else textbox_update,
            gr.update(value="Stop ⏹️", interactive=True),
            None, is_voice_chat, gr.update(visible=False),
            image_input_data, gr.update(), gr.update(),
        )

    def finished_early(hint, show_fact_check=False):
        """UI tuple for an aborted turn: input re-enabled, no audio or file."""
        return (
            history, False, hint,
            gr.update(interactive=True), gr.update(value="↑", interactive=True),
            None, False, gr.update(visible=show_fact_check),
            image_input_data, gr.update(), gr.update(),
        )

    # 1. INITIAL HISTORY CHECK
    if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
        yield finished_early("Error: Generator called in unexpected state.")
        return

    # 2. PRE-PROCESSING & CONTEXT
    last_user_index = len(history) - 2
    original_message = history[last_user_index]['content']

    if isinstance(image_input_data, str):  # file upload path
        is_vqa_flow = image_input_data != ""
    elif isinstance(image_input_data, np.ndarray):  # webcam data
        is_vqa_flow = image_input_data.size > 0
    else:
        is_vqa_flow = image_input_data is not None

    vqa_success = False
    if is_vqa_flow:
        llm_input_message, vqa_success = process_image(image_input_data, original_message)
        # Show in the chat that this turn carried an image.
        history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
    else:
        llm_input_message = original_message
        image_input_data = None
    # The "vqa" intent is only forced when the image was actually analysed.
    force_vqa = is_vqa_flow and vqa_success

    # Build the transcript from the turns BEFORE the current one. The current
    # user message is appended exactly once below; iterating up to the
    # placeholder (as before) sent it to the model twice.
    prompt_parts = [f"SYSTEM: {SYSTEM_PROMPT}\n"]
    for item in history[:last_user_index]:
        role = item['role'].upper()
        content = item['content'] if item['content'] is not None else ""
        if role == "ASSISTANT":
            prompt_parts.append(f"LUNA: {content}\n")
        elif role == "USER":
            prompt_parts.append(f"USER: {content}\n")
    prompt_parts.append(f"USER: {llm_input_message}\nLUNA: ")
    prompt = "".join(prompt_parts)

    # 3. HINT BOX & STREAM START
    history[-1]['content'] = ""
    yield in_progress("✨ Luna is starting to think...", gr.update(value="", interactive=False))
    time.sleep(0.5)

    # 4. DIRECT STREAMING
    full_response = ""
    current_intent = "default"
    try:
        stream = llm.create_completion(
            prompt=prompt, max_tokens=8192,
            stop=["USER:", "SYSTEM:", "</s>"],
            echo=False, stream=True, temperature=0.7
        )
    except Exception as e:
        error_text = f"❌ Error generating response: {e}"
        history[-1]['content'] = error_text
        yield finished_early(error_text)
        return

    try:
        for output in stream:
            full_response += output["choices"][0].get("text", "")
            # Re-parse the whole accumulated text so tags split across tokens
            # are recognised as soon as they are complete.
            current_intent, current_hint, display_text = get_intent_status(full_response, force_vqa)
            history[-1]['content'] = display_text
            yield in_progress(current_hint)
    except Exception as e:
        # Keep whatever was streamed so far and offer the fact-check button.
        _, _, partial_text = get_intent_status(full_response, force_vqa)
        history[-1]['content'] = partial_text
        yield finished_early(f"⚠️ Streaming interrupted: {e}", show_fact_check=True)
        return

    # 5. POST-PROCESSING & TOOL EXECUTION
    file_download_path = None
    _, _, content_for_tool = get_intent_status(full_response, force_vqa)

    # 5a. File generation / tool action based on the final intent
    file_kinds = {"image_generate": "image", "doc_generate": "doc", "ppt_generate": "ppt"}
    if current_intent in file_kinds:
        yield in_progress(INTENT_STATUS_MAP[current_intent])
        history, file_download_path = generate_file_content(content_for_tool, history, file_kinds[current_intent])
    elif current_intent == "open_google":
        history[-1]['content'] = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
    elif current_intent == "open_camera":
        history[-1]['content'] = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."

    # 5b. Confidence check, skipped for tool intents (including "vqa")
    TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
    if current_intent not in TOOL_EXECUTION_INTENTS:
        # The checker needs the RAW text because it parses the confidence tag.
        final_response_content = check_confidence_and_augment(full_response, original_message)
        history[-1]['content'] = final_response_content
    else:
        final_response_content = history[-1]['content']

    # 5c. TTS generation
    audio_file_path = text_to_audio(final_response_content, is_voice_chat)

    # 6. FINAL YIELD
    # The staged-image slot is a gr.State; a plain None is yielded so the
    # stored value is really reset (an update wrapper is not a state value).
    yield (
        history, False, "✅ Response generated.",
        gr.update(interactive=True), gr.update(value="↑", interactive=True),
        audio_file_path, False, gr.update(visible=True),
        None, gr.update(), file_download_path,
    )
 # --- GRADIO WRAPPERS FOR UI ACTIONS ---
 def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
     """Flip the options menu open or closed.

     Args:
         current_visibility: Whether the menu is currently shown.

     Returns:
         The new visibility flag, an update applying it to the menu, an update
         that hides the fact-check row, and an update setting the menu button's
         label ("⬇️" when open, "➕" when closed).
     """
     now_open = not current_visibility
     menu_update = gr.update(visible=now_open)
     fact_check_update = gr.update(visible=False)
     if now_open:
         button_update = gr.update(value="⬇️")
     else:
         button_update = gr.update(value="➕")
     return now_open, menu_update, fact_check_update, button_update
+# FIX: user_turn now only adds history if input exists, DOES NOT clear staged_image
def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
    """Queue the user's turn: append the message plus an empty assistant placeholder.

    Nothing is appended when there is neither text nor staged media, or when
    the previous assistant placeholder is still empty (a reply is in flight).
    In both cases the inputs are returned unchanged.

    Args:
        user_message: Text from the input box. Whitespace-only text counts as
            no text.
        chat_history: Existing chat messages. It is not modified; a new list is
            returned instead. ``None`` is treated as an empty history.
        staged_image_input: Staged media (path, NumPy array or ``None``). Only
            its presence is checked here; it is not cleared.

    Returns:
        ``("", new_history)`` when a turn was queued, otherwise
        ``(user_message, chat_history)``.
    """
    has_text = bool(user_message and user_message.strip())

    if isinstance(staged_image_input, str):
        has_image = staged_image_input != ""
    elif isinstance(staged_image_input, np.ndarray):
        has_image = staged_image_input.size > 0
    else:
        has_image = staged_image_input is not None

    if not has_text and not has_image:
        return user_message, chat_history

    # A still-empty assistant placeholder means the previous turn is generating.
    if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
        return user_message, chat_history

    user_message_to_add = user_message if has_text else "Analyzing Staged Media."

    # Copy instead of appending in place so the caller's list stays untouched.
    updated_history = list(chat_history or [])
    updated_history.append({"role": "user", "content": user_message_to_add})
    updated_history.append({"role": "assistant", "content": ""})
    return "", updated_history
 def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
     """Stage an uploaded file for the next message.

     Args:
         file_path: Path of the uploaded file; falsy when the upload was cancelled.

     Returns:
         The value to stage (the path, or ``None`` on cancel), a hint message,
         an update that empties and enables the textbox, and an update that
         makes the uploader non-interactive.
     """
     if not file_path:
         staged, notice = None, "File upload cancelled."
     else:
         staged = file_path
         notice = f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️)."
     textbox_update = gr.update(value="", interactive=True)
     uploader_update = gr.update(interactive=False)
     return staged, notice, textbox_update, uploader_update
+# FIX: Reinstate clear_staged_media
 def clear_staged_media() -> None:
     """Return the value that resets the staged-media state to "nothing staged".

     The result is meant to be routed to a ``gr.State`` output. A state holds
     whatever plain value the handler returns, so ``None`` is returned directly
     rather than wrapped in ``gr.update(value=None)``: an update object is a
     component-property change and is not a reliable way to replace a stored
     state value.
     """
     return None
+# --- (manual_fact_check, auto_capture_camera remain largely the same, ensure they use history format correctly) ---
 def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
     """Append a web-search snippet for the latest user query to the last reply.

     The query is the most recent non-empty user message, reduced to the part
     after ``**User Query:**`` and without the ``[IMAGE RECEIVED]`` marker.

     Args:
         history: Chat messages; the last entry's ``content`` is extended.

     Returns:
         The history, a hint message, and an update hiding the fact-check row.
         When there is no reply or no query, the history is returned untouched
         with an error hint.
     """
     hide_row = gr.update(visible=False)
     if not history or not history[-1]['content']:
         return history, "Error: No final response to check.", hide_row

     latest_user_text = next(
         (turn['content'] for turn in reversed(history) if turn['role'] == 'user' and turn['content']),
         "",
     )
     query = latest_user_text.split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
     if not query:
         return history, "Error: Could not find query.", hide_row

     snippet = web_search_tool(query)
     # Shallow copy: the list is new, but the last message dict is shared with
     # `history`, so its content changes there too.
     checked = list(history)
     checked[-1]['content'] += snippet
     return checked, "✅ Double-checked with web facts.", hide_row
def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
    """Queue the user's turn and switch the UI into camera-capture mode.

    The turn is queued through ``user_turn``; if that leaves an empty assistant
    placeholder at the end of the history, it is filled with a status message.

    Args:
        user_message: Text from the input box.
        chat_history: Existing chat messages.
        staged_image_input: Currently staged media; passed through unchanged.

    Returns:
        An 8-tuple: empty textbox value, the history, the staged media, then
        five updates in order: hidden, visible, hidden, a disabled control
        showing the capture countdown, and the "➕" label.
    """
    _, chat_history = user_turn(user_message, chat_history, staged_image_input)

    placeholder_pending = (
        chat_history
        and chat_history[-1]['role'] == 'assistant'
        and chat_history[-1]['content'] == ""
    )
    if placeholder_pending:
        chat_history[-1]['content'] = "📸 Preparing camera capture..."

    first_hidden = gr.update(visible=False)
    shown = gr.update(visible=True)
    second_hidden = gr.update(visible=False)
    countdown = gr.update(value="📸 Capturing in 3 seconds...", interactive=False)
    plus_label = gr.update(value="➕")
    return "", chat_history, staged_image_input, first_hidden, shown, second_hidden, countdown, plus_label
 # --- GRADIO INTERFACE ---
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
     # --- State ---
     stop_signal = gr.State(value=False)
     is_voice_chat = gr.State(value=False)
     staged_image = gr.State(value=None)  # file path or webcam array waiting to be sent
     menu_visible_state = gr.State(value=False)

     # --- Layout ---
     gr.HTML("<h1 style='text-align: center; color: #4B0082;'>🌙 Luna Chat Space</h1>")
     hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
     file_download_output = gr.File(label="Generated File", visible=False)  # hidden download slot

     with gr.Row(visible=False) as fact_check_btn_row:
         gr.Column(min_width=1)
         btn_fact_check = gr.Button("Fact Check 🔎")
         gr.Column(min_width=1)

     chatbot = gr.Chatbot(label="Luna", height=500, type='messages')

     with gr.Row(visible=False) as webcam_capture_row:
         webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
         close_webcam_btn = gr.Button("✅ Use this image")

     with gr.Row(visible=False) as audio_record_row:
         audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)

     with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
         file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
         btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
         btn_add_files = gr.Button("📎 Upload File")

     with gr.Row(variant="panel") as input_row:
         btn_menu = gr.Button("➕", interactive=True, size="sm")
         txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
         mic_btn = gr.Button("🎙️", interactive=True, size="sm")
         combined_btn = gr.Button("✈️", variant="primary", size="sm")

     audio_output = gr.Audio(visible=False)

     # Order must match the 11-tuples yielded by chat_generator.
     output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
     generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]

     # --- WIRE EVENTS ---
     # Menu, file upload and camera
     btn_menu.click(fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False)

     def prepare_file_upload():
         """Close the menu, enable the uploader and empty the textbox."""
         return (
             gr.update(visible=False),
             gr.update(value="➕"),
             gr.update(visible=False),
             gr.update(interactive=True),
             gr.update(value=""),
         )

     btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
     file_input.change(fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False)
     btn_take_photo.click(
         fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")),
         inputs=[],
         outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu],
         queue=False,
     )

     # Closing the webcam stages the captured NumPy image.
     close_webcam_btn.click(
         fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, "📸 Photo staged: Click send (✈️).", gr.update(value="")),
         inputs=[webcam_capture_component],
         outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt],
         queue=False,
     )

     # Microphone: record -> transcribe -> queue turn -> generate -> clear media.
     # The chain is parenthesised: the previous backslash continuation was
     # broken by a trailing comment, which made the module unparsable.
     (
         mic_btn.click(
             fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
             inputs=[],
             outputs=[input_row, audio_record_row, hint_box],
             queue=False,
         )
         .then(fn=simulate_recording_delay, inputs=[], outputs=[], queue=False)
         .then(
             fn=lambda: (gr.update(visible=True), gr.update(visible=False), "🎙️ Processing..."),
             inputs=[],
             outputs=[input_row, audio_record_row, hint_box],
             queue=False,
         )
         # txt is listed twice on purpose: once for its value, once for its update.
         .then(fn=transcribe_audio, inputs=audio_input, outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row], queue=False)
         # user_turn reads staged_image but only outputs text and history.
         .then(fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False)
         .then(fn=chat_generator, inputs=generator_inputs, outputs=output_components, queue=True)
         # Staged media is cleared only AFTER generation has consumed it.
         .then(fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False)
     )

     # Text submit (Enter key)
     txt.submit(
         fn=user_turn,
         inputs=[txt, chatbot, staged_image],
         outputs=[txt, chatbot],
         queue=False,
     ).then(
         fn=chat_generator,
         inputs=generator_inputs,
         outputs=output_components,
         queue=True,
     ).then(
         fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False,
     )

     # Send button
     combined_btn.click(
         fn=user_turn,
         inputs=[txt, chatbot, staged_image],
         outputs=[txt, chatbot],
         queue=False,
     ).then(
         fn=chat_generator,
         inputs=generator_inputs,
         outputs=output_components,
         queue=True,
     ).then(
         fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False,
     )

     # Fact check
     btn_fact_check.click(fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True)

 # Launch once the Blocks context has been closed, as the previous revision did.
 demo.queue(max_size=20).launch(server_name="0.0.0.0")