Spaces:

cosmosai471
/

come_onnn

Running

App Files Files Community

cosmosai471 committed 26 days ago

Commit

a7a6d88

verified ·

1 Parent(s): 4f7f656

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -192

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# --- (Import statements remain the same) ---
 import gradio as gr
 import os
 import time
@@ -15,36 +14,44 @@ from diffusers import StableDiffusionPipeline
 from docx import Document
 from pptx import Presentation
 from io import BytesIO
-import numpy as np # <-- Import NumPy for robust image check
-# --- (CONFIGURATIONS & MODEL LOADING remain the same) ---
-STT_DEVICE = "cpu"
 os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
 AUDIO_DIR = "audio_outputs"
-DOC_DIR = "doc_outputs"
 if not os.path.exists(AUDIO_DIR):
     os.makedirs(AUDIO_DIR)
 if not os.path.exists(DOC_DIR):
     os.makedirs(DOC_DIR)
 REPO_ID = "cosmosai471/Luna-v3"
 MODEL_FILE = "luna.gguf"
-LOCAL_MODEL_PATH = MODEL_FILE
 SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
-# --- (safe_del, LLM loading, Pipeline loading remain the same) ---
 llm = None
 try:
     print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
     hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILE, local_dir=".")
     if not os.path.exists(LOCAL_MODEL_PATH):
         raise FileNotFoundError(f"Download failed for {MODEL_FILE}")
     print("Initializing Llama...")
     llm = Llama(
         model_path=LOCAL_MODEL_PATH,
-        n_ctx=8192,
-        n_threads=4,
-        n_batch=256,
-        n_gpu_layers=0,
         verbose=False
     )
     print("✅ Luna Model loaded successfully!")
@@ -64,7 +71,7 @@ except Exception as e:
 image_pipe = None
 try:
-    VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
     image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
     print(f"✅ Loaded {VLM_MODEL_ID} for image processing.")
 except Exception as e:
@@ -73,43 +80,37 @@ except Exception as e:
 img_gen_pipe = None
 try:
     img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
-    img_gen_pipe.to(STT_DEVICE)
     print("✅ Loaded Stable Diffusion (v1-5) for image generation.")
 except Exception as e:
     print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
 # --- UTILITY FUNCTIONS ---
 def simulate_recording_delay():
     time.sleep(3)
-    return None
 def clean_response_stream(raw_text: str) -> str:
     """Cleans up raw response text by removing tags and repeats."""
     clean_text = re.split(r'\nUser:|\nAssistant:|</s>|Intent|Action', raw_text, 1)[0].strip()
     clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
-    # Remove Intent and Confidence tags specifically for display
     clean_text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', clean_text).strip()
     words = clean_text.split()
-    if len(words) > 4 and words[-2:] == words[-4:-2]:
         clean_text = ' '.join(words[:-2])
     return clean_text
 def web_search_tool(query: str) -> str:
-    time.sleep(1.5)
     print(f"Simulating Google Search fallback for: {query}")
     return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
-# FIX: Confidence check operates on RAW response string
 def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
-    """
-    Checks confidence from the raw response tag. Triggers fallback if low.
-    Returns the *cleaned* response (or augmented one).
-    """
     confidence_match = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
     confidence_score = int(confidence_match.group(1)) if confidence_match else 0
-    # Always clean the response *after* parsing confidence
     cleaned_response = clean_response_stream(raw_response_with_tags)
     if confidence_score < 70:
@@ -118,20 +119,13 @@ def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> st
         if "error" in cleaned_response.lower() or confidence_score == 0:
              final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
         else:
-            # Append search results to the existing (low confidence) cleaned response
             final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
     else:
-        # High confidence, return the already cleaned response
         final_response = cleaned_response
     return final_response
-# FIX: Correct VQA prompt format and error handling
 def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
-    """
-    Uses the VLM pipeline (LLaVA) for VQA.
-    Returns the prompt injection string and a boolean indicating success.
-    """
     global image_pipe
     success = False
     if image_pipe is None:
@@ -141,36 +135,27 @@ def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
     try:
         if isinstance(image_data_or_path, str):
             image = Image.open(image_data_or_path).convert("RGB")
-        elif isinstance(image_data_or_path, np.ndarray): # Handle NumPy array from webcam
             image = Image.fromarray(image_data_or_path).convert("RGB")
         if image:
-            # FIX: Use the specific format required by llava-hf/llava-1.5-7b-hf
             vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
-            # Increased max_new_tokens for potentially longer VQA responses
             results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
             raw_vlm_output = results[0]['generated_text'] if results else "Error: VLM did not return text."
-            # Extract just the assistant's part
             vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
-            if not vqa_response: # Handle case where split fails or response is empty
-                vqa_response = "VLM analysis failed or returned empty."
             del image
             success = True
             prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
             return prompt_injection, success
     except Exception as e:
         print(f"Image Pipeline Error: {e}")
         return f"[Image Processing Error: {e}] **User Query:** {message}", success
-    # If image processing failed before VLM call
     return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
-# --- (transcribe_audio, text_to_audio remain the same) ---
 def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
     if stt_pipe is None or audio_file_path is None:
         error_msg = "Error: Whisper model failed to load or no audio recorded."
@@ -179,11 +164,11 @@ def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.upda
         transcribed_text = stt_pipe(audio_file_path)["text"]
         new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
         return (
-            transcribed_text.strip(),
-            f"🎙️ Transcribed: '{transcribed_text.strip()}'",
-            gr.update(interactive=True),
-            new_button_update,
-            True,
             gr.update(visible=False)
         )
     except Exception as e:
@@ -192,20 +177,19 @@ def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.upda
 def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
     if not is_voice_chat:
-        return None
-    clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$|\(file=.*?\)', '', text, flags=re.DOTALL | re.MULTILINE) # Also remove file links for TTS
     if len(clean_text.strip()) > 5:
         try:
             audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
             tts = gTTS(text=clean_text.strip(), lang='en')
             tts.save(audio_output_path)
-            return audio_output_path
         except Exception as e:
             print(f"gTTS Error: {e}")
             return None
     return None
-# --- (INTENT_STATUS_MAP remains the same) ---
 INTENT_STATUS_MAP = {
     "code_generate": "Analyzing requirements and drafting code 💻...",
     "code_explain": "Reviewing code logic and writing explanation 💡...",
@@ -220,29 +204,17 @@ INTENT_STATUS_MAP = {
     "default": "Luna is thinking...",
 }
-# FIX: Updated get_intent_status to force VQA intent more reliably
 def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
     """Parses intent/confidence, returns intent, status, cleaned text."""
-    # 1. Parse Intent
     match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
     intent = match.group(1).lower() if match else "default"
-    # FIX: Force 'vqa' intent if the flow started with an image, regardless of model output
     if is_vqa_flow:
         intent = "vqa"
-    # 2. Clean Text (remove both tags for display)
     cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1).strip()
     cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1).strip()
-    # 3. Get Status
     status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
     return intent, status, cleaned_text
-# --- (generate_file_content remains the same) ---
 def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
     """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
     file_path = None
@@ -282,60 +254,51 @@ def generate_file_content(content: str, history: List[Dict[str, str]], file_type
 # --- CORE GENERATOR FUNCTION ---
 def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
-    """Main generator function for streaming LLM response."""
     # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
-    # 1. INITIAL HISTORY CHECK
     if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
         yield history, False, "Error: Generator called in unexpected state.", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
         return
-    # 2. PRE-PROCESSING & CONTEXT
     last_user_index = len(history) - 2
-    original_message = history[last_user_index]['content'] # Get user msg from history
-    # FIX: Robust check for image/file presence using isinstance and None check.
     is_vqa_flow = False
-    if isinstance(image_input_data, str): # File upload path
         is_vqa_flow = image_input_data != ""
-    elif isinstance(image_input_data, np.ndarray): # Webcam data
-        is_vqa_flow = image_input_data.size > 0 # Check if array is not empty
-    else: # Could be None or other types
         is_vqa_flow = image_input_data is not None
     vqa_success = False
     if is_vqa_flow:
-        # Process image/VQA
         processed_message, vqa_success = process_image(image_input_data, original_message)
-        # Update user message in history to show it was an image prompt
         history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
-        # Use the VQA-enriched message for the LLM prompt
         llm_input_message = processed_message
     else:
         llm_input_message = original_message
-        image_input_data = None # Ensure cleared if not VQA
-    # Build the final prompt string for the LLM
     prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
-    for item in history[:-1]: # Iterate through history up to the current turn
         role = item['role'].upper()
         content = item['content'] if item['content'] is not None else ""
         if role == "ASSISTANT": prompt += f"LUNA: {content}\n"
         elif role == "USER": prompt += f"USER: {content}\n"
-    prompt += f"USER: {llm_input_message}\nLUNA: " # Add final user input
-    # 3. HINT BOX & STREAM START
-    hint_text = "✨ Luna is starting to think..."
-    history[-1]['content'] = "" # Initialize assistant content
     yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
-    time.sleep(0.5)
-    # 4. DIRECT STREAMING
     full_response = ""
-    current_intent = "default"
     try:
         stream = llm.create_completion(
-            prompt=prompt, max_tokens=8192,
             stop=["USER:", "SYSTEM:", "</s>"],
             echo=False, stream=True, temperature=0.7
         )
@@ -349,9 +312,8 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
         for output in stream:
             token = output["choices"][0].get("text", "")
             full_response += token
-            # Get intent, status hint, and cleaned text for display
-            current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success) # Pass VQA success status
-            history[-1]['content'] = display_text # Update chat display
             yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
     except Exception as e:
         _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
@@ -362,9 +324,8 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
     # 5. POST-PROCESSING & TOOL EXECUTION
     file_download_path = None
-    _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success) # Get final cleaned content
-    # 5a. File Generation/Tool Action based on final intent
     if current_intent == "image_generate":
         yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
         history, file_download_path = generate_file_content(content_for_tool, history, "image")
@@ -376,45 +337,33 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
         history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
     elif current_intent == "open_google":
         final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
-        history[-1]['content'] = final_cleaned_response # Update content
     elif current_intent == "open_camera":
         final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
-        history[-1]['content'] = final_cleaned_response # Update content
-    # 5b. Confidence Check (only if NOT a tool intent)
     TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
     if current_intent not in TOOL_EXECUTION_INTENTS:
-        # Pass the RAW full_response (with tags) to confidence checker
         final_response_content = check_confidence_and_augment(full_response, original_message)
-        history[-1]['content'] = final_response_content # Update content if augmented
     else:
-        # If it was a tool intent, the content is already set (or cleaned implicitly)
         final_response_content = history[-1]['content']
-    # 5c. TTS Generation
-    audio_file_path = text_to_audio(final_response_content, is_voice_chat)
-    # 6. FINAL YIELD
     hint = "✅ Response generated."
-    # We yield the path to the hidden file component to make it downloadable
-    # We yield None to staged_image state to clear it *after* generation
     yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(), file_download_path
 # --- GRADIO WRAPPERS FOR UI ACTIONS ---
 def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
-    new_visibility = not current_visibility
     return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "➕")
-# FIX: user_turn now only adds history if input exists, DOES NOT clear staged_image
 def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
-    """
-    Appends the user message to the chat history if text or image is provided.
-    Clears the input box. Does NOT clear the staged_image state here.
-    """
     has_text = bool(user_message)
-    # Robust check for image presence
     has_image = False
     if isinstance(staged_image_input, str):
         has_image = staged_image_input != ""
@@ -423,25 +372,20 @@ def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_imag
     else:
         has_image = staged_image_input is not None
-    # If no input, do nothing
     if not has_text and not has_image:
-        return user_message, chat_history # Return original inputs
-    # If the last turn is still generating, do nothing to prevent race conditions
     if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
          return user_message, chat_history
-    # Determine message content
     if not has_text and has_image:
         user_message_to_add = "Analyzing Staged Media."
     else:
         user_message_to_add = user_message
-    # Add messages to history
-    chat_history.append({"role": "user", "content": user_message_to_add})
-    chat_history.append({"role": "assistant", "content": ""}) # Add placeholder
-    # Clear only the text input box
     return "", chat_history
 def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
@@ -449,12 +393,10 @@ def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
         return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️).", gr.update(value="", interactive=True), gr.update(interactive=False)
     return None, "File upload cancelled.", gr.update(value="", interactive=True), gr.update(interactive=False)
-# FIX: Reinstate clear_staged_media
 def clear_staged_media() -> gr.update:
     """Clears the staged media state component."""
     return gr.update(value=None)
-# --- (manual_fact_check, auto_capture_camera remain largely the same, ensure they use history format correctly) ---
 def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
     if not history or not history[-1]['content']:
         return history, "Error: No final response to check.", gr.update(visible=False)
@@ -470,44 +412,41 @@ def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str
     return new_history, "✅ Double-checked with web facts.", gr.update(visible=False)
 def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
-    # Use user_turn logic to setup the chat history correctly for the intent flow
-    _, chat_history = user_turn(user_message, chat_history, staged_image_input) # Pass staged image
-    # Update the last assistant response placeholder with a status message
     if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
         chat_history[-1]['content'] = "📸 Preparing camera capture..."
-    # Update UI to show the webcam (start capture simulation)
-    # Note: staged_image is NOT cleared here by user_turn
     return "", chat_history, staged_image_input, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value="📸 Capturing in 3 seconds...", interactive=False), gr.update(value="➕")
 # --- GRADIO INTERFACE ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
-    # --- (State Components remain the same) ---
     stop_signal = gr.State(value=False)
-    is_voice_chat = gr.State(value=False)
-    staged_image = gr.State(value=None)
     menu_visible_state = gr.State(value=False)
     gr.HTML("<h1 style='text-align: center; color: #4B0082;'>🌙 Luna Chat Space</h1>")
-    hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
-    file_download_output = gr.File(label="Generated File", visible=False) # Hidden file component
     with gr.Row(visible=False) as fact_check_btn_row:
         gr.Column(min_width=1); btn_fact_check = gr.Button("Fact Check 🔎"); gr.Column(min_width=1)
-    chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
     with gr.Row(visible=False) as webcam_capture_row:
         webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
         close_webcam_btn = gr.Button("✅ Use this image")
     with gr.Row(visible=False) as audio_record_row:
         audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
     with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
-        file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
         btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
         btn_add_files = gr.Button("📎 Upload File")
@@ -516,79 +455,74 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
         txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
         mic_btn = gr.Button("🎙️", interactive=True, size="sm")
         combined_btn = gr.Button("✈️", variant="primary", size="sm")
-    audio_output = gr.Audio(visible=False)
-    # Output components list now reflects the hidden file component
     output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
     # --- WIRE EVENTS ---
-    # --- (Menu, File Upload, Take Photo events remain the same) ---
-    btn_menu.click(fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False)
     def prepare_file_upload(): return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
     btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
-    file_input.change(fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False)
-    btn_take_photo.click(fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")), inputs=[], outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu], queue=False)
-    # Webcam Close (stages the image data/path)
     close_webcam_btn.click(
         fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️).", gr.update(value="")),
-        inputs=[webcam_capture_component],
-        outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt], # staged_image gets the NumPy array here
-        queue=False
     )
-    # --- (Mic wiring remains the same, ensure user_turn includes staged_image) ---
-    mic_btn.click(fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."), inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False)\
-    .then(fn=simulate_recording_delay, inputs=[], outputs=[], queue=False)\
-    .then(fn=lambda: (gr.update(visible=True), gr.update(visible=False), "🎙️ Processing..."), inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False)\
-    .then(fn=transcribe_audio, inputs=audio_input, outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row], queue=False)\
-    .then(fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False) # staged_image is passed but not modified here
-    .then(
-        fn=chat_generator,
-        inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat], # staged_image is read here
-        outputs=output_components,
-        queue=True,
     ).then(
-        fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False # Clear staged_image AFTER generation
     )
-    # Main Submission Logic
-    # FIX: Pass staged_image to user_turn, but DO NOT modify it there.
-    #      Clear staged_image using clear_staged_media *after* chat_generator runs.
     generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
     txt.submit(
-        fn=user_turn,
-        inputs=[txt, chatbot, staged_image], # Pass staged_image state
-        outputs=[txt, chatbot], # user_turn only outputs text and history
-        queue=False
     ).then(
-        fn=chat_generator,
-        inputs=generator_inputs, # Use the state value here
-        outputs=output_components,
-        queue=True,
     ).then(
-        fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False # Clear state AFTER generation
     )
     combined_btn.click(
-        fn=user_turn,
-        inputs=[txt, chatbot, staged_image], # Pass staged_image state
-        outputs=[txt, chatbot], # user_turn only outputs text and history
-        queue=False
     ).then(
-        fn=chat_generator,
-        inputs=generator_inputs, # Use the state value here
-        outputs=output_components,
-        queue=True,
     ).then(
-        fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False # Clear state AFTER generation
     )
-    # --- (Fact Check event remains the same) ---
-    btn_fact_check.click(fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True)
-    demo.queue(max_size=20).launch(server_name="0.0.0.0")

 import gradio as gr
 import os
 import time
 from docx import Document
 from pptx import Presentation
 from io import BytesIO
+import numpy as np
+# --- CONFIGURATION & INITIALIZATION ---
+STT_DEVICE = "cpu"
 os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
 AUDIO_DIR = "audio_outputs"
+DOC_DIR = "doc_outputs"
 if not os.path.exists(AUDIO_DIR):
     os.makedirs(AUDIO_DIR)
 if not os.path.exists(DOC_DIR):
     os.makedirs(DOC_DIR)
 REPO_ID = "cosmosai471/Luna-v3"
 MODEL_FILE = "luna.gguf"
+LOCAL_MODEL_PATH = MODEL_FILE
 SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
+def safe_del(self):
+    try:
+        if hasattr(self, "close") and callable(self.close):
+            self.close()
+    except Exception:
+        pass
+Llama.__del__ = safe_del
+# --- MODEL LOADING ---
 llm = None
 try:
     print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
     hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILE, local_dir=".")
     if not os.path.exists(LOCAL_MODEL_PATH):
         raise FileNotFoundError(f"Download failed for {MODEL_FILE}")
     print("Initializing Llama...")
     llm = Llama(
         model_path=LOCAL_MODEL_PATH,
+        n_ctx=8192,
+        n_threads=4,
+        n_batch=256,
+        n_gpu_layers=0,
         verbose=False
     )
     print("✅ Luna Model loaded successfully!")
 image_pipe = None
 try:
+    VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
     image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
     print(f"✅ Loaded {VLM_MODEL_ID} for image processing.")
 except Exception as e:
 img_gen_pipe = None
 try:
     img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
+    img_gen_pipe.to(STT_DEVICE)
     print("✅ Loaded Stable Diffusion (v1-5) for image generation.")
 except Exception as e:
     print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
 # --- UTILITY FUNCTIONS ---
 def simulate_recording_delay():
     time.sleep(3)
+    return None
 def clean_response_stream(raw_text: str) -> str:
     """Cleans up raw response text by removing tags and repeats."""
     clean_text = re.split(r'\nUser:|\nAssistant:|</s>|Intent|Action', raw_text, 1)[0].strip()
     clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
     clean_text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', clean_text).strip()
     words = clean_text.split()
+    if len(words) > 4 and words[-2:] == words[-4:-2]:
         clean_text = ' '.join(words[:-2])
     return clean_text
 def web_search_tool(query: str) -> str:
+    time.sleep(1.5)
     print(f"Simulating Google Search fallback for: {query}")
     return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
 def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
+    """Checks confidence from the raw response tag. Triggers fallback if low."""
     confidence_match = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
     confidence_score = int(confidence_match.group(1)) if confidence_match else 0
     cleaned_response = clean_response_stream(raw_response_with_tags)
     if confidence_score < 70:
         if "error" in cleaned_response.lower() or confidence_score == 0:
              final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
         else:
             final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
     else:
         final_response = cleaned_response
     return final_response
 def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
+    """Run visual question answering on an image with the `image_pipe` pipeline.
 
     Args:
         image_data_or_path: Either a file path (opened with `Image.open`) or a
             NumPy array (wrapped with `Image.fromarray`); both are converted to RGB.
         message: The user's question, embedded in the VQA prompt and echoed back
             in the returned text.
 
     Returns:
         A `(text, success)` pair. On success `text` carries the VQA analysis
         followed by the user query and `success` is True; on any failure `text`
         starts with an `[Image Processing Error: ...]` tag and `success` is False.
     """
     global image_pipe
     success = False
     # NOTE(review): the body of this `if` is not visible in this view — confirm
     # what is returned when the pipeline failed to load.
     if image_pipe is None:
     try:
         # `image` is only bound by the two branches below in the visible code.
         if isinstance(image_data_or_path, str):
             image = Image.open(image_data_or_path).convert("RGB")
+        elif isinstance(image_data_or_path, np.ndarray):
             image = Image.fromarray(image_data_or_path).convert("RGB")
         if image:
             vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
             results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
             raw_vlm_output = results[0]['generated_text'] if results else "Error: VLM did not return text."
             # Keep only the text after the last "ASSISTANT:" marker, i.e. drop the echoed prompt.
             vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
+            if not vqa_response: vqa_response = "VLM analysis failed or returned empty."
             del image
             success = True
             prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
             return prompt_injection, success
     except Exception as e:
         print(f"Image Pipeline Error: {e}")
         return f"[Image Processing Error: {e}] **User Query:** {message}", success
     # Reached when no exception was raised but `image` was falsy.
     return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
 def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
     """Transcribe a recorded audio file with `stt_pipe` and build the UI updates.
 
     The success path returns a 6-tuple: the stripped transcription, a hint string
     quoting it, an update re-enabling a component, the send-button update, True,
     and an update hiding a component. The mic-button event chain maps these to
     `[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row]`.
 
     NOTE(review): the `try:` opener, the early-return for the error case and the
     body of the `except` clause are not visible in this view — confirm them
     against the full file before relying on this description.
     """
     if stt_pipe is None or audio_file_path is None:
         error_msg = "Error: Whisper model failed to load or no audio recorded."
         transcribed_text = stt_pipe(audio_file_path)["text"]
         new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
         return (
+            transcribed_text.strip(),
+            f"🎙️ Transcribed: '{transcribed_text.strip()}'",
+            gr.update(interactive=True),
+            new_button_update,
+            True,
             gr.update(visible=False)
         )
     except Exception as e:
 def text_to_audio(text: str, is_voice_chat: bool) -> "Optional[str]":
     """Synthesize `text` to an MP3 file with gTTS for voice-chat turns.
 
     Code fences, image-error tags, web-search result blocks and `(file=...)`
     markers are removed before synthesis so they are not read aloud.
 
     Args:
         text: The assistant response to speak. `None` or empty yields no audio.
         is_voice_chat: Audio is only produced when this is True.
 
     Returns:
         The path of the saved MP3 inside `AUDIO_DIR`, or None when the turn is
         not a voice chat, the cleaned text is 5 characters or shorter, or gTTS
         fails (the error is printed, not raised).
     """
     # Function-scope import keeps this block self-contained.
     import uuid
 
     if not is_voice_chat or not text:
         return None
     clean_text = re.sub(
         r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$|\(file=.*?\)',
         '',
         text,
         flags=re.DOTALL | re.MULTILINE,
     ).strip()
     # Very short leftovers are not worth a synthesis call.
     if len(clean_text) <= 5:
         return None
     try:
         # A UUID-based name avoids two responses overwriting each other's file,
         # which a 4-digit random suffix could not guarantee.
         audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{uuid.uuid4().hex}.mp3")
         tts = gTTS(text=clean_text, lang='en')
         tts.save(audio_output_path)
         return audio_output_path
     except Exception as e:
         # Best-effort: audio is optional, so a TTS failure must not break the chat turn.
         print(f"gTTS Error: {e}")
         return None
 # Maps a lower-cased intent name (parsed from the model's "[Intent: ...]" tag) to
 # the status hint returned by get_intent_status; "default" is the fallback entry
 # used for any intent that has no key of its own.
 INTENT_STATUS_MAP = {
     "code_generate": "Analyzing requirements and drafting code 💻...",
     "code_explain": "Reviewing code logic and writing explanation 💡...",
     "default": "Luna is thinking...",
 }
 def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
     """Parse the leading intent/confidence tags out of a (possibly partial) response.
 
     Args:
         raw_response: Model output expected to start with "[Intent: name]" and
             "[Confidence: NN]" tags.
         is_vqa_flow: When True the intent is forced to "vqa" regardless of the tag.
 
     Returns:
         `(intent, status, cleaned_text)`: the lower-cased intent ("default" when
         no tag is found), the matching hint from `INTENT_STATUS_MAP` (falling
         back to its "default" entry), and the response with the first intent
         tag and first confidence tag removed.
     """
     match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
     if is_vqa_flow:
         intent = "vqa"
     else:
         intent = match.group(1).lower() if match else "default"
     # Strip with the same case-insensitivity used for detection; otherwise a tag
     # such as "[intent: qa]" is recognised but still shown to the user.
     cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1, flags=re.IGNORECASE).strip()
     cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1, flags=re.IGNORECASE).strip()
     status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
     return intent, status, cleaned_text
 def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
     """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
     # NOTE(review): only this initialisation is visible in this view. chat_generator
     # unpacks the result as `(history, file_download_path)` — confirm the return shape.
     file_path = None
 # --- CORE GENERATOR FUNCTION ---
 def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
     """Stream the model's reply into the last history entry, then run post-processing.
 
     This is a generator: every `yield` is an 11-item tuple whose positions are
     listed in the "Component Outputs" comment below. It expects `history` to end
     with a user message followed by an empty assistant placeholder; otherwise it
     yields a single error state and stops.
     """
     # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
     # Guard: the last entry must be an assistant message with empty content.
     if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
         yield history, False, "Error: Generator called in unexpected state.", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
         return
     last_user_index = len(history) - 2
+    original_message = history[last_user_index]['content']
     # Decide whether an image is staged: non-empty string, non-empty array, or any non-None value.
     is_vqa_flow = False
+    if isinstance(image_input_data, str):
         is_vqa_flow = image_input_data != ""
+    elif isinstance(image_input_data, np.ndarray):
+        is_vqa_flow = image_input_data.size > 0
+    else:
         is_vqa_flow = image_input_data is not None
     vqa_success = False
     if is_vqa_flow:
         processed_message, vqa_success = process_image(image_input_data, original_message)
         # The stored user turn is rewritten with a marker; the model receives the VQA text instead.
         history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
         llm_input_message = processed_message
     else:
         llm_input_message = original_message
+        image_input_data = None
     # Build a plain-text transcript prompt from every entry except the assistant placeholder.
     # NOTE(review): history[:-1] still contains the latest user entry, and the same turn
     # is appended again after the loop — confirm this duplication is intended.
     prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
+    for item in history[:-1]:
         role = item['role'].upper()
         content = item['content'] if item['content'] is not None else ""
         if role == "ASSISTANT": prompt += f"LUNA: {content}\n"
         elif role == "USER": prompt += f"USER: {content}\n"
+    prompt += f"USER: {llm_input_message}\nLUNA: "
+    hint_text = "✨ Luna is starting to think..."
+    history[-1]['content'] = ""
     # First UI update: clear and lock the textbox, switch the button to "Stop".
     yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
+    time.sleep(0.5)
     full_response = ""
+    current_intent = "default"
     try:
         stream = llm.create_completion(
+            prompt=prompt, max_tokens=8192,
             stop=["USER:", "SYSTEM:", "</s>"],
             echo=False, stream=True, temperature=0.7
         )
         # Re-parse the accumulated text on every token so the hint and the displayed
         # text follow the intent tag as soon as it is complete.
         for output in stream:
             token = output["choices"][0].get("text", "")
             full_response += token
+            current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
+            history[-1]['content'] = display_text
             yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
     except Exception as e:
         # NOTE(review): `final_response_text` is not used in the visible lines — the
         # rest of this handler may be missing from this view.
         _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
     # 5. POST-PROCESSING & TOOL EXECUTION
     file_download_path = None
+    _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success)
     if current_intent == "image_generate":
         yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
         history, file_download_path = generate_file_content(content_for_tool, history, "image")
         # NOTE(review): the "ppt" call below directly follows the "image" call in this
         # view; intermediate branches appear to be missing — confirm against the full file.
         history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
     elif current_intent == "open_google":
         final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
+        history[-1]['content'] = final_cleaned_response
     elif current_intent == "open_camera":
         final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
+        history[-1]['content'] = final_cleaned_response
     # Only non-tool intents go through the confidence check / augmentation step.
     TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
     if current_intent not in TOOL_EXECUTION_INTENTS:
         final_response_content = check_confidence_and_augment(full_response, original_message)
+        history[-1]['content'] = final_response_content
     else:
         final_response_content = history[-1]['content']
+    audio_file_path = text_to_audio(final_response_content, is_voice_chat)
     hint = "✅ Response generated."
     # Final UI update: unlock input, restore the send button, reset the voice flag,
     # show the fact-check row, clear the staged image and hand over any generated file.
     yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(), file_download_path
 # --- GRADIO WRAPPERS FOR UI ACTIONS ---
 def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
     """Flip the menu visibility flag and build the matching component updates.
 
     Returns the new flag, an update showing/hiding the menu, an update hiding
     the fact-check row, and an update setting the menu button's icon.
     """
     is_open = not current_visibility
     if is_open:
         button_icon = "⬇️"
     else:
         button_icon = "➕"
     menu_update = gr.update(visible=is_open)
     fact_check_update = gr.update(visible=False)
     button_update = gr.update(value=button_icon)
     return is_open, menu_update, fact_check_update, button_update
 def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
     """Append the user's turn plus an empty assistant placeholder to the history.
 
     Args:
         user_message: Text from the input box; blank or whitespace-only text
             counts as "no text".
         chat_history: Message dicts with "role" and "content" keys; it is
             extended in place. `None` is treated as an empty history.
         staged_image_input: Staged media; a non-empty string or any other
             non-None value counts as "image present".
 
     Returns:
         `("", history)` when a turn was added (the empty string clears the
         textbox), otherwise `(user_message, history)` unchanged — which happens
         when there is neither text nor image, or when a previous assistant
         placeholder is still pending.
     """
     if chat_history is None:
         chat_history = []
     # Whitespace-only input must not create a blank turn or mask the image placeholder.
     has_text = bool(user_message and user_message.strip())
     if isinstance(staged_image_input, str):
         has_image = staged_image_input != ""
     else:
         has_image = staged_image_input is not None
     if not has_text and not has_image:
         return user_message, chat_history
     # A trailing empty assistant entry means a response is still being generated.
     if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
         return user_message, chat_history
     user_message_to_add = user_message if has_text else "Analyzing Staged Media."
     chat_history.append({"role": "user", "content": user_message_to_add})
     chat_history.append({"role": "assistant", "content": ""})
     return "", chat_history
 def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
     """Stage an uploaded file path and report the outcome in the hint text.
 
     Both returns are 4-tuples that the `file_input.change` event maps to
     `[staged_image, hint_box, txt, file_input]`: the staged value (the path, or
     None when cancelled), the hint message, an update clearing and enabling a
     component, and an update disabling a component.
 
     NOTE(review): the condition guarding the first return is not visible in this
     view — confirm it against the full file.
     """
         return file_path, f"📎 File staged: {os.path.basename(file_path)}. Click send (✈️).", gr.update(value="", interactive=True), gr.update(interactive=False)
     return None, "File upload cancelled.", gr.update(value="", interactive=True), gr.update(interactive=False)
 def clear_staged_media() -> gr.update:
     """Return an update that resets the staged-media state value to None."""
     reset_update = gr.update(value=None)
     return reset_update
 def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
     """Handle the fact-check button for the last response in `history`.
 
     Returns a 3-tuple of history, hint text and an update hiding a component;
     the button's click event maps it to `[chatbot, hint_box, fact_check_btn_row]`.
     When the history is empty or its last entry has no content, the history is
     returned unchanged with an error hint.
 
     NOTE(review): `new_history` is not assigned in the lines visible here — the
     code that builds it appears to be missing from this view.
     """
     if not history or not history[-1]['content']:
         return history, "Error: No final response to check.", gr.update(visible=False)
     return new_history, "✅ Double-checked with web facts.", gr.update(visible=False)
 def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
     """Record the user's turn, mark the reply as a pending camera capture and build UI updates.
 
     The turn is added through `user_turn`; if that leaves an empty assistant
     placeholder at the end of the history, it is filled with a "preparing
     capture" notice. Returns an 8-tuple: an empty string, the history, the
     staged image unchanged, and five component updates.
     """
     turn_result = user_turn(user_message, chat_history, staged_image_input)
     chat_history = turn_result[1]
     if chat_history:
         last_entry = chat_history[-1]
         if last_entry['role'] == 'assistant' and last_entry['content'] == "":
             last_entry['content'] = "📸 Preparing camera capture..."
     hidden_first = gr.update(visible=False)
     shown = gr.update(visible=True)
     hidden_second = gr.update(visible=False)
     countdown = gr.update(value="📸 Capturing in 3 seconds...", interactive=False)
     menu_icon = gr.update(value="➕")
     return "", chat_history, staged_image_input, hidden_first, shown, hidden_second, countdown, menu_icon
 # --- GRADIO INTERFACE ---
 # Build the Gradio UI: state holders, visible components, then the event wiring.
 # NOTE(review): `btn_menu`, `input_row` and `simulate_recording_delay` are used below
 # but are not defined in the lines visible here — confirm they exist in the full file.
 with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
+    # --- State Components ---
     stop_signal = gr.State(value=False)
+    is_voice_chat = gr.State(value=False)
+    staged_image = gr.State(value=None)
     menu_visible_state = gr.State(value=False)
     gr.HTML("<h1 style='text-align: center; color: #4B0082;'>🌙 Luna Chat Space</h1>")
+    hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
+    file_download_output = gr.File(label="Generated File", visible=False)
     with gr.Row(visible=False) as fact_check_btn_row:
         gr.Column(min_width=1); btn_fact_check = gr.Button("Fact Check 🔎"); gr.Column(min_width=1)
+    chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
     with gr.Row(visible=False) as webcam_capture_row:
         webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
         close_webcam_btn = gr.Button("✅ Use this image")
     with gr.Row(visible=False) as audio_record_row:
         audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
     with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
+        file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
         btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
         btn_add_files = gr.Button("📎 Upload File")
         txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
         mic_btn = gr.Button("🎙️", interactive=True, size="sm")
         combined_btn = gr.Button("✈️", variant="primary", size="sm")
+    audio_output = gr.Audio(visible=False)
     # Order matters: chat_generator yields 11-item tuples that map positionally onto this list.
     output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
     # --- WIRE EVENTS ---
+    btn_menu.click(
+        fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False
+    )
     # Five updates, one per entry of the `outputs` list on the click handler below.
     def prepare_file_upload(): return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
     btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
+    file_input.change(
+        fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False
+    )
+    btn_take_photo.click(
+        fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")),
+        inputs=[], outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu], queue=False
+    )
     # Confirming the webcam shot stores the captured array in `staged_image`.
     close_webcam_btn.click(
         fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️).", gr.update(value="")),
+        inputs=[webcam_capture_component], outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt], queue=False
     )
     # Mic flow: show recorder -> delay -> restore input row -> transcribe -> add turn -> generate -> clear media.
+    mic_btn.click(
+        fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
+        inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False
     ).then(
+        fn=simulate_recording_delay, inputs=[], outputs=[], queue=False
+    ).then(
+        fn=lambda: (gr.update(visible=True), gr.update(visible=False), "🎙️ Processing recording..."),
+        inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False
+    ).then(
         # `txt` is listed twice: once for the transcribed text, once for the interactivity update.
+        fn=transcribe_audio, inputs=audio_input, outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row], queue=False
+    ).then(
+        fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
+    ).then(
+        fn=chat_generator, inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat], outputs=output_components, queue=True
+    ).then(
+        fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
     )
     # Shared input list for chat_generator, in the order of its parameters.
     generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
+    # Text submit (Enter key)
     txt.submit(
+        fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
     ).then(
+        fn=chat_generator, inputs=generator_inputs, outputs=output_components, queue=True
     ).then(
+        fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
     )
+    # Send button click
     combined_btn.click(
+        fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
     ).then(
+        fn=chat_generator, inputs=generator_inputs, outputs=output_components, queue=True
     ).then(
+        fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
+    )
+    btn_fact_check.click(
+        fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True
     )
 # Enable the event queue (up to 20 pending events) and serve on all network interfaces.
+demo.queue(max_size=20).launch(server_name="0.0.0.0")