cosmosai471 commited on
Commit
4f7f656
Β·
verified Β·
1 Parent(s): b27fc81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -395
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  import os
3
  import time
@@ -9,41 +10,27 @@ from llama_cpp import Llama
9
  from typing import List, Dict, Any, Tuple
10
  from PIL import Image
11
  from transformers import pipeline
12
- from gtts import gTTS
13
  from diffusers import StableDiffusionPipeline
14
- from docx import Document
15
- from pptx import Presentation
16
- from io import BytesIO
 
17
 
18
- # --- CONFIGURATION & INITIALIZATION ---
19
- # Set device for pipelines (STT/VQA/ImageGen). Use "cpu" for compatibility.
20
- STT_DEVICE = "cpu"
21
  os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
22
  AUDIO_DIR = "audio_outputs"
23
- DOC_DIR = "doc_outputs"
24
  if not os.path.exists(AUDIO_DIR):
25
  os.makedirs(AUDIO_DIR)
26
  if not os.path.exists(DOC_DIR):
27
  os.makedirs(DOC_DIR)
28
-
29
- # Hugging Face Model Info
30
  REPO_ID = "cosmosai471/Luna-v3"
31
  MODEL_FILE = "luna.gguf"
32
- LOCAL_MODEL_PATH = MODEL_FILE
33
-
34
- # FIX: Updated SYSTEM PROMPT for Confidence Scoring
35
  SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
36
-
37
- # Helper to safely delete Llama instance (prevents resource leaks)
38
- def safe_del(self):
39
- try:
40
- if hasattr(self, "close") and callable(self.close):
41
- self.close()
42
- except Exception:
43
- pass
44
- Llama.__del__ = safe_del
45
-
46
- # --- MODEL LOADING ---
47
  llm = None
48
  try:
49
  print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
@@ -54,10 +41,10 @@ try:
54
  print("Initializing Llama...")
55
  llm = Llama(
56
  model_path=LOCAL_MODEL_PATH,
57
- n_ctx=8192,
58
- n_threads=4,
59
- n_batch=256,
60
- n_gpu_layers=0,
61
  verbose=False
62
  )
63
  print("βœ… Luna Model loaded successfully!")
@@ -65,11 +52,9 @@ except Exception as e:
65
  print(f"❌ Error loading Luna model: {e}")
66
  class DummyLLM:
67
  def create_completion(self, *args, **kwargs):
68
- # Must match the new prompt format to avoid parsing errors
69
  yield {'choices': [{'text': '[Intent: qa_general][Confidence: 0] ERROR: Luna model failed to load. Check logs and resources.'}]}
70
  llm = DummyLLM()
71
 
72
- # --- MULTIMODAL PIPELINE LOADING ---
73
  stt_pipe = None
74
  try:
75
  stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=STT_DEVICE)
@@ -79,7 +64,7 @@ except Exception as e:
79
 
80
  image_pipe = None
81
  try:
82
- VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
83
  image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
84
  print(f"βœ… Loaded {VLM_MODEL_ID} for image processing.")
85
  except Exception as e:
@@ -88,148 +73,139 @@ except Exception as e:
88
  img_gen_pipe = None
89
  try:
90
  img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
91
- img_gen_pipe.to(STT_DEVICE)
92
  print("βœ… Loaded Stable Diffusion (v1-5) for image generation.")
93
  except Exception as e:
94
  print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
95
 
96
-
97
  # --- UTILITY FUNCTIONS ---
98
 
99
  def simulate_recording_delay():
100
- """Simulates a 3-second recording time for the UI flow."""
101
  time.sleep(3)
102
- return None
103
 
104
  def clean_response_stream(raw_text: str) -> str:
105
- """Cleans up raw LLaMA-style output and removes repeats, and removes tags."""
106
- # 1. Strip stop tokens
107
  clean_text = re.split(r'\nUser:|\nAssistant:|</s>|Intent|Action', raw_text, 1)[0].strip()
108
-
109
- # 2. Remove instruction/action markers and new Confidence/Intent tags
110
  clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
 
111
  clean_text = re.sub(r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]', '', clean_text).strip()
112
-
113
- # 3. Simple word-repeat check
114
  words = clean_text.split()
115
- if len(words) > 4 and words[-2:] == words[-4:-2]:
116
  clean_text = ' '.join(words[:-2])
117
-
118
  return clean_text
119
 
120
  def web_search_tool(query: str) -> str:
121
- """Simulated Google Search Fallback."""
122
- time.sleep(1.5)
123
  print(f"Simulating Google Search fallback for: {query}")
124
  return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
125
 
126
- # FIX: Updated confidence check
127
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
128
  """
129
- Checks the model's self-reported confidence from the *raw* response.
130
- Triggers fallback if low.
131
  """
132
- # 1. Parse Confidence Score from the raw, unprocessed response
133
  confidence_match = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
134
  confidence_score = int(confidence_match.group(1)) if confidence_match else 0
135
-
136
- # 2. Clean the response *after* parsing confidence
137
  cleaned_response = clean_response_stream(raw_response_with_tags)
138
-
139
- # 3. Check if confidence is below threshold
140
- if confidence_score < 33:
141
  print(f"Low confidence ({confidence_score}%) detected. Triggering Google Search fallback.")
142
  search_snippet = web_search_tool(prompt)
143
-
144
  if "error" in cleaned_response.lower() or confidence_score == 0:
145
  final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
146
  else:
 
147
  final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
148
  else:
149
- # Confidence is high, just return the cleaned response
150
  final_response = cleaned_response
151
-
152
  return final_response
153
 
154
- # FIX: Updated image processing with correct VQA prompt
155
- def process_image(image_data_or_path: Any, message: str) -> str:
156
- """Uses the VLM pipeline (LLaVA) for Visual Question Answering (VQA)."""
 
 
 
157
  global image_pipe
158
-
159
  if image_pipe is None:
160
- return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}"
161
 
162
  image = None
163
  try:
164
  if isinstance(image_data_or_path, str):
165
  image = Image.open(image_data_or_path).convert("RGB")
166
- elif image_data_or_path is not None:
167
  image = Image.fromarray(image_data_or_path).convert("RGB")
168
-
169
  if image:
170
- # FIX: Use the special <image> token for the llava-1.5-hf pipeline
171
  vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
172
-
173
- results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 768})
174
- # The VLM's *full* response is in 'generated_text', including the prompt
175
- raw_vlm_output = results[0]['generated_text'] if results else "The image could not be processed."
 
176
  # Extract just the assistant's part
177
  vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
 
 
 
178
  del image
179
-
180
- # This is the VQA analysis that will be fed into Luna
181
  prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
182
- return prompt_injection
183
-
184
  except Exception as e:
185
  print(f"Image Pipeline Error: {e}")
186
- return f"[Image Processing Error: {e}] **User Query:** {message}"
187
-
188
- return message
189
 
 
 
 
 
 
190
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
191
- """Transcribes audio file using Whisper."""
192
  if stt_pipe is None or audio_file_path is None:
193
  error_msg = "Error: Whisper model failed to load or no audio recorded."
194
  return "", error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), False, gr.update(visible=False)
195
-
196
  try:
197
  transcribed_text = stt_pipe(audio_file_path)["text"]
198
  new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
199
-
200
  return (
201
- transcribed_text.strip(),
202
- f"πŸŽ™οΈ Transcribed: '{transcribed_text.strip()}'",
203
- gr.update(interactive=True),
204
- new_button_update,
205
- True,
206
  gr.update(visible=False)
207
  )
208
  except Exception as e:
209
  error_msg = f"Transcription Error: {e}"
210
  return "", error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), False, gr.update(visible=False)
211
 
212
-
213
  def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
214
- """Converts the final response text to an MP3 file using gTTS."""
215
  if not is_voice_chat:
216
- return None
217
-
218
- clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$', '', text, flags=re.DOTALL)
219
-
220
  if len(clean_text.strip()) > 5:
221
  try:
222
  audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
223
  tts = gTTS(text=clean_text.strip(), lang='en')
224
  tts.save(audio_output_path)
225
- return audio_output_path
226
  except Exception as e:
227
  print(f"gTTS Error: {e}")
228
  return None
229
  return None
230
 
231
-
232
- # Intent and Dynamic Hint Logic
233
  INTENT_STATUS_MAP = {
234
  "code_generate": "Analyzing requirements and drafting code πŸ’»...",
235
  "code_explain": "Reviewing code logic and writing explanation πŸ’‘...",
@@ -244,494 +220,375 @@ INTENT_STATUS_MAP = {
244
  "default": "Luna is thinking...",
245
  }
246
 
247
- def get_intent_status(raw_response: str, is_vqa: bool) -> Tuple[str, str, str]:
248
- """Parses the Intent tag from the model's raw response and returns the intent, status, and cleaned response."""
249
-
 
 
250
  # 1. Parse Intent
251
  match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
252
  intent = match.group(1).lower() if match else "default"
253
-
254
- # If it was a VQA flow (image was sent), we force the VQA intent
255
- # This ensures the VQA tool override works even if Luna misidentifies the intent
256
- if is_vqa:
257
  intent = "vqa"
258
-
259
  # 2. Clean Text (remove both tags for display)
260
  cleaned_text = re.sub(r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1).strip()
261
  cleaned_text = re.sub(r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1).strip()
262
-
263
  # 3. Get Status
264
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
265
  return intent, status, cleaned_text
266
 
267
 
268
- # --- NEW GENERATOR FUNCTIONS FOR UPGRADES ---
269
-
270
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
271
  """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
272
  file_path = None
273
-
274
  try:
275
  if file_type == "image":
276
- if img_gen_pipe is None:
277
- raise RuntimeError("Image generation model is not loaded.")
278
-
279
  image = img_gen_pipe(content).images[0]
280
  file_filename = f"generated_img_{random.randint(1000, 9999)}.png"
281
  file_path = os.path.join(DOC_DIR, file_filename)
282
  image.save(file_path)
283
-
284
- # FIX: Format output as Gradio markdown file link
285
  display_content = f"πŸ–ΌοΈ **Image Generated!**\n\n[Download {file_filename}](file={file_path})"
286
-
287
  elif file_type == "doc":
288
  doc = Document()
289
  doc.add_heading('Luna Generated Document', 0)
290
  doc.add_paragraph(content)
291
-
292
  file_filename = f"generated_doc_{random.randint(1000, 9999)}.docx"
293
  file_path = os.path.join(DOC_DIR, file_filename)
294
  doc.save(file_path)
295
-
296
  display_content = f"πŸ“„ **Document Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"
297
-
298
  elif file_type == "ppt":
299
  prs = Presentation()
300
- title_slide_layout = prs.slide_layouts[0]
301
- slide = prs.slides.add_slide(title_slide_layout)
302
  slide.shapes.title.text = "Luna Generated Presentation"
303
- subtitle = slide.placeholders[1]
304
- subtitle.text = content[:100] + "..."
305
-
306
  file_filename = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
307
  file_path = os.path.join(DOC_DIR, file_filename)
308
  prs.save(file_path)
309
-
310
  display_content = f"πŸ“Š **Presentation Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"
311
-
312
  else:
313
  raise ValueError(f"Unknown file type: {file_type}")
314
-
315
- # Update the history with the markdown link
316
  history[-1]['content'] = display_content
317
-
318
  except Exception as e:
319
- error_msg = f"❌ **Error generating {file_type.upper()}:** {e}. Please check model loading or library installation."
320
  history[-1]['content'] = error_msg
321
- file_path = None # Ensure path is None on failure
322
-
323
  return history, file_path
324
 
325
-
326
  # --- CORE GENERATOR FUNCTION ---
327
-
328
- def chat_generator(message: str, image_path: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
329
- """The main generator function for streaming the LLM response."""
330
-
331
  # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
332
-
333
  # 1. INITIAL HISTORY CHECK
334
  if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
335
- yield history, False, "Error: Generator called without a recent user message in history.", gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), None, False, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
336
  return
337
 
338
  # 2. PRE-PROCESSING & CONTEXT
339
  last_user_index = len(history) - 2
340
- original_message = history[last_user_index]['content']
341
 
342
- # FIX: Robust check for image/file presence.
343
- # This avoids the ValueError: `array == ""`
344
  is_vqa_flow = False
345
- if isinstance(image_path, str):
346
- is_vqa_flow = image_path != ""
347
- else:
348
- # It's not a string, so if it's not None, it's image data (e.g., numpy array)
349
- is_vqa_flow = image_path is not None
350
-
 
 
351
  if is_vqa_flow:
352
  # Process image/VQA
353
- message = process_image(image_path, original_message)
354
- # Update the user's content to reflect VQA flow for context building
355
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
 
 
356
  else:
357
- message = original_message
358
- image_path = None # Clear image_path for final yield
359
 
360
- # Build the prompt with conversation history (Context)
361
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
362
-
363
- for i, item in enumerate(history[:-1]):
364
  role = item['role'].upper()
365
  content = item['content'] if item['content'] is not None else ""
366
-
367
- if role == "ASSISTANT":
368
- prompt += f"LUNA: {content}\n"
369
- elif role == "USER":
370
- prompt += f"USER: {content}\n"
371
-
372
- prompt += f"USER: {message}\nLUNA: "
373
-
374
- # 3. HINT BOX & STREAM START
375
- hint_text = "✨ Luna is starting to think..."
376
 
377
- history[-1]['content'] = ""
378
- yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
379
- time.sleep(0.5)
 
 
380
 
381
  # 4. DIRECT STREAMING
382
  full_response = ""
383
- current_intent = "default"
384
-
385
  try:
386
  stream = llm.create_completion(
387
- prompt=prompt,
388
- max_tokens=8192,
389
  stop=["USER:", "SYSTEM:", "</s>"],
390
- echo=False,
391
- stream=True,
392
- temperature=0.7
393
  )
394
  except Exception as e:
395
  error_text = f"❌ Error generating response: {e}"
396
  history[-1]['content'] = error_text
397
- yield history, False, error_text, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), None, False, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
398
  return
399
 
400
  try:
401
  for output in stream:
402
  token = output["choices"][0].get("text", "")
403
  full_response += token
404
-
405
- # Get intent and cleaned text for display
406
- current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow)
407
-
408
- # Update the last assistant message's content
409
- history[-1]['content'] = display_text
410
-
411
- # Yield continuous update
412
- yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
413
-
414
  except Exception as e:
415
- _, _, final_response_text = get_intent_status(full_response, is_vqa_flow)
416
  error_msg = f"⚠️ Streaming interrupted: {e}"
417
- history[-1]['content'] = final_response_text
418
- yield history, False, error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), None, False, gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None)
419
  return
420
 
421
  # 5. POST-PROCESSING & TOOL EXECUTION
422
- # We use the *full_response* (with tags) for confidence check
423
- # We use the *current_intent* (parsed during stream) for tool logic
424
-
425
  file_download_path = None
426
-
427
- # 5a. File Generation/Tool Action
 
428
  if current_intent == "image_generate":
429
- yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
430
- _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow)
431
  history, file_download_path = generate_file_content(content_for_tool, history, "image")
432
- final_response = history[-1]['content']
433
-
434
  elif current_intent == "doc_generate":
435
- yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
436
- _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow)
437
  history, file_download_path = generate_file_content(content_for_tool, history, "doc")
438
- final_response = history[-1]['content']
439
-
440
  elif current_intent == "ppt_generate":
441
- yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True, elem_classes=["circle-btn", "stop-mode"]), None, is_voice_chat, gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None)
442
- _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow)
443
  history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
444
- final_response = history[-1]['content']
445
-
446
  elif current_intent == "open_google":
447
- _, _, final_response = get_intent_status(full_response, is_vqa_flow)
448
- final_response += "\n\nπŸ”— **Action:** Since I cannot open a window for you, click here to search Google for this topic: [Google Search Link](https://www.google.com/search?q=open+google+simulated+search)"
449
-
450
  elif current_intent == "open_camera":
451
- _, _, final_response = get_intent_status(full_response, is_vqa_flow)
452
- final_response += "\n\nπŸ“Έ **Action:** I cannot directly open the camera within this chat stream, but I will prepare the UI for you to use the 'Google Lens' button if you click 'Send' now!"
453
-
454
- # 5b. Confidence Check and Augmentation (Bypassed by Intent)
455
- TOOL_EXECUTION_INTENTS = [
456
- "image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"
457
- ]
458
-
459
- # FIX: Check if intent is NOT a tool intent, then check confidence
460
- if current_intent not in TOOL_EXECUTION_INTENTS:
461
- # We pass the *full_response* (with tags) to the confidence checker
462
- final_response = check_confidence_and_augment(full_response, original_message)
463
  else:
464
- # If it *is* a tool intent, we just clean the response (unless it was already cleaned by a file generator)
465
- if file_download_path is None:
466
- _, _, final_response = get_intent_status(full_response, is_vqa_flow)
467
-
468
  # 5c. TTS Generation
469
- audio_file_path = text_to_audio(final_response, is_voice_chat)
470
-
471
- # 5d. Final History Update
472
- history[-1]['content'] = final_response
473
-
474
  # 6. FINAL YIELD
475
- hint = "βœ… Response generated."
476
-
477
- # We clear the staged image here by outputting None to its state component
478
- yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"]), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(value=None), file_download_path
 
479
 
480
  # --- GRADIO WRAPPERS FOR UI ACTIONS ---
481
 
482
  def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
483
- """Toggles the visibility of the media options menu."""
484
- new_visibility = not current_visibility
485
  return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "βž•")
486
 
487
- def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image: Any) -> Tuple[str, List[Dict[str, str]], Any]:
 
488
  """
489
- Appends the user message to the chat history and clears the input box.
490
- Crucially, it only adds a message if there is text OR a staged image.
491
- It also clears the staged image *from the state* immediately, so it's not sticky.
492
  """
493
-
494
- # Check if there is any input (text or image)
495
  has_text = bool(user_message)
496
- # Check for image (robustly)
497
  has_image = False
498
- if isinstance(staged_image, str):
499
- has_image = staged_image != ""
 
 
500
  else:
501
- has_image = staged_image is not None
502
-
503
- # If no text AND no image, do nothing.
504
  if not has_text and not has_image:
505
- return "", chat_history, staged_image # Return original state
506
 
507
- # If the last message is an incomplete assistant message, wait for it to finish.
508
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
509
- return user_message, chat_history, staged_image # Return original state, don't clear text yet
510
 
511
- # We have a valid submission (text, image, or both)
512
-
513
- # If no text was provided but an image was, create a default message
514
  if not has_text and has_image:
515
- user_message = "Analyzing Staged Media."
516
-
517
- chat_history.append({"role": "user", "content": user_message})
518
- chat_history.append({"role": "assistant", "content": ""})
519
-
520
- # FIX: Clear the text box and CLEAR THE STAGED IMAGE STATE
521
- return "", chat_history, None
 
 
 
522
 
523
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
524
- """Stages the file path and updates the hint box."""
525
  if file_path:
526
- # file_path is a string path here
527
- return file_path, f"πŸ“Ž File staged: {os.path.basename(file_path)}. Click send (✈️) to analyze.", gr.update(value="", interactive=True), gr.update(interactive=False)
528
- return None, "File upload cancelled/cleared.", gr.update(value="", interactive=True), gr.update(interactive=False)
529
 
 
530
  def clear_staged_media() -> gr.update:
531
- """Clears the staged media state after sending or canceling."""
532
- # This function is now redundant because user_turn handles clearing,
533
- # but we will keep it for the explicit .then() calls if needed, though they are removed.
534
  return gr.update(value=None)
535
 
 
536
  def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
537
- """Triggers a manual fact check/web search, using the 'messages' format."""
538
  if not history or not history[-1]['content']:
539
  return history, "Error: No final response to check.", gr.update(visible=False)
540
-
541
  last_user_prompt = ""
542
  for item in reversed(history):
543
  if item['role'] == 'user' and item['content']:
544
  last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
545
  break
546
-
547
- if not last_user_prompt:
548
- return history, "Error: Could not find the original user query.", gr.update(visible=False)
549
-
550
  web_results = web_search_tool(last_user_prompt)
551
-
552
  new_history = list(history)
553
  new_history[-1]['content'] += web_results
554
-
555
  return new_history, "βœ… Double-checked with web facts.", gr.update(visible=False)
556
 
557
- def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
558
- """
559
- Simulates the automatic capture action by updating the UI components
560
- to show the camera, and then immediately capturing (simulated).
561
- """
562
- _, chat_history, staged_image = user_turn(user_message, chat_history, staged_image)
563
-
564
  if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
565
  chat_history[-1]['content'] = "πŸ“Έ Preparing camera capture..."
566
-
567
- return "", chat_history, staged_image, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value="πŸ“Έ Capturing in 3 seconds...", interactive=False), gr.update(value="βž•")
 
568
 
569
 
570
  # --- GRADIO INTERFACE ---
571
 
572
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
573
-
574
- # --- State Components ---
575
  stop_signal = gr.State(value=False)
576
- is_voice_chat = gr.State(value=False)
577
- staged_image = gr.State(value=None)
578
  menu_visible_state = gr.State(value=False)
579
-
580
  gr.HTML("<h1 style='text-align: center; color: #4B0082;'>πŸŒ™ Luna Chat Space</h1>")
 
 
581
 
582
- # Hint Box
583
- hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
584
 
585
- # FIX: File Download Box is now INVISIBLE. Downloads will appear in chat.
586
- file_download_output = gr.File(label="Generated File", visible=False)
587
 
588
- # Fact Check button row
589
- with gr.Row(visible=False) as fact_check_btn_row:
590
- gr.Column(min_width=1)
591
- btn_fact_check = gr.Button("Fact Check πŸ”Ž")
592
- gr.Column(min_width=1)
593
-
594
- # Chatbot Area
595
- chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
596
-
597
- # Webcam Capture Area (Hidden)
598
  with gr.Row(visible=False) as webcam_capture_row:
599
- # type="numpy" ensures raw data is passed
600
  webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
601
  close_webcam_btn = gr.Button("βœ… Use this image")
602
-
603
- # Audio Recording Row (Hidden)
604
  with gr.Row(visible=False) as audio_record_row:
605
  audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
606
-
607
- # Option Menu (Hidden)
608
  with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
609
- # type="filepath" ensures a path string is passed
610
- file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
611
  btn_take_photo = gr.Button("πŸ“Έ Google Lens (Take Photo)")
612
  btn_add_files = gr.Button("πŸ“Ž Upload File")
613
 
614
- # Fixed Input Row (Footer)
615
  with gr.Row(variant="panel") as input_row:
616
  btn_menu = gr.Button("βž•", interactive=True, size="sm")
617
  txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
618
  mic_btn = gr.Button("πŸŽ™οΈ", interactive=True, size="sm")
619
  combined_btn = gr.Button("✈️", variant="primary", size="sm")
620
-
621
- audio_output = gr.Audio(visible=False)
622
 
623
- # Group all output components for convenience
 
 
624
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
625
 
626
  # --- WIRE EVENTS ---
627
 
628
- # 1. Menu Button
629
- btn_menu.click(
630
- fn=toggle_menu,
631
- inputs=[menu_visible_state],
632
- outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu],
633
- queue=False
634
- )
635
-
636
- # 2. File Upload
637
- def prepare_file_upload():
638
- return gr.update(visible=False), gr.update(value="βž•"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
639
-
640
  btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
641
-
642
- file_input.change(
643
- fn=stage_file_upload,
644
- inputs=[file_input],
645
- outputs=[staged_image, hint_box, txt, file_input],
646
- queue=False
647
- )
648
 
649
- # 3. 'Take photo' (Webcam)
650
- btn_take_photo.click(
651
- fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "πŸ“Έ Camera Active. Capture an image.", gr.update(value="βž•")),
652
- inputs=[],
653
- outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu],
654
- queue=False
655
- )
656
-
657
- # 4. Webcam Close
658
  close_webcam_btn.click(
659
- fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"πŸ“Έ Photo staged: Click send (✈️) to process.", gr.update(value="")),
660
  inputs=[webcam_capture_component],
661
- outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt],
662
  queue=False
663
  )
664
-
665
- # 5. Mic wiring
666
- mic_btn.click(
667
- fn=lambda: (gr.update(visible=False), gr.update(visible=True), "πŸŽ™οΈ Recording..."),
668
- inputs=[],
669
- outputs=[input_row, audio_record_row, hint_box],
670
- queue=False
671
- ).then(
672
- fn=simulate_recording_delay,
673
- inputs=[],
674
- outputs=[],
675
- queue=False,
676
- ).then(
677
- fn=lambda: (gr.update(visible=True), gr.update(visible=False), "πŸŽ™οΈ Processing recording..."),
678
- inputs=[],
679
- outputs=[input_row, audio_record_row, hint_box],
680
- queue=False,
681
- ).then(
682
- fn=transcribe_audio,
683
- inputs=audio_input,
684
- outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row],
685
- queue=False
686
- ).then(
687
- fn=user_turn,
688
- inputs=[txt, chatbot, staged_image],
689
- outputs=[txt, chatbot, staged_image],
690
- queue=False
691
- ).then(
692
- fn=chat_generator,
693
- inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat],
694
  outputs=output_components,
695
  queue=True,
 
 
696
  )
697
 
698
- # 6. Main Submission Logic (Text submit and Send button)
 
 
 
699
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
700
-
701
- # Text submit (Enter key)
702
  txt.submit(
703
- # FIX: user_turn now also takes staged_image as input and output
704
  fn=user_turn,
705
- inputs=[txt, chatbot, staged_image],
706
- outputs=[txt, chatbot, staged_image],
707
  queue=False
708
  ).then(
709
- fn=chat_generator,
710
- inputs=generator_inputs,
711
- outputs=output_components,
712
  queue=True,
 
 
713
  )
714
-
715
- # Send button click
716
  combined_btn.click(
717
- # FIX: user_turn now also takes staged_image as input and output
718
  fn=user_turn,
719
- inputs=[txt, chatbot, staged_image],
720
- outputs=[txt, chatbot, staged_image],
721
  queue=False
722
  ).then(
723
  fn=chat_generator,
724
- inputs=generator_inputs,
725
  outputs=output_components,
726
- queue=True
727
- )
728
-
729
- # 7. Fact Check Button
730
- btn_fact_check.click(
731
- fn=manual_fact_check,
732
- inputs=[chatbot],
733
- outputs=[chatbot, hint_box, fact_check_btn_row],
734
- queue=True
735
  )
736
 
737
- demo.queue(max_size=20).launch(server_name="0.0.0.0")
 
 
 
 
1
+ # --- (Import statements remain the same) ---
2
  import gradio as gr
3
  import os
4
  import time
 
10
  from typing import List, Dict, Any, Tuple
11
  from PIL import Image
12
  from transformers import pipeline
13
+ from gtts import gTTS
14
  from diffusers import StableDiffusionPipeline
15
+ from docx import Document
16
+ from pptx import Presentation
17
+ from io import BytesIO
18
+ import numpy as np # <-- Import NumPy for robust image check
19
 
20
+ # --- (CONFIGURATIONS & MODEL LOADING remain the same) ---
21
# --- CONFIGURATION ---
# Device handed to the speech-to-text, image-to-text and image-generation pipelines.
STT_DEVICE = "cpu"

# Opt out of Gradio usage analytics.
os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'

# Output folders for synthesized speech and for generated images/documents.
AUDIO_DIR = "audio_outputs"
DOC_DIR = "doc_outputs"
# exist_ok=True replaces the check-then-create pattern, which can fail when
# two processes start at the same time and both see the folder as missing.
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(DOC_DIR, exist_ok=True)

# Hugging Face repository and file name of the GGUF model.
REPO_ID = "cosmosai471/Luna-v3"
MODEL_FILE = "luna.gguf"
LOCAL_MODEL_PATH = MODEL_FILE

# The tag format requested here is what the intent and confidence regexes
# elsewhere in this file parse out of every reply.
SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
33
+ # --- (safe_del, LLM loading, Pipeline loading remain the same) ---
 
 
 
 
 
 
 
 
 
 
34
  llm = None
35
  try:
36
  print(f"Downloading {MODEL_FILE} from {REPO_ID}...")
 
41
  print("Initializing Llama...")
42
  llm = Llama(
43
  model_path=LOCAL_MODEL_PATH,
44
+ n_ctx=8192,
45
+ n_threads=4,
46
+ n_batch=256,
47
+ n_gpu_layers=0,
48
  verbose=False
49
  )
50
  print("βœ… Luna Model loaded successfully!")
 
52
  print(f"❌ Error loading Luna model: {e}")
53
  class DummyLLM:
54
  def create_completion(self, *args, **kwargs):
 
55
  yield {'choices': [{'text': '[Intent: qa_general][Confidence: 0] ERROR: Luna model failed to load. Check logs and resources.'}]}
56
  llm = DummyLLM()
57
 
 
58
  stt_pipe = None
59
  try:
60
  stt_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=STT_DEVICE)
 
64
 
65
  image_pipe = None
66
  try:
67
+ VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
68
  image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
69
  print(f"βœ… Loaded {VLM_MODEL_ID} for image processing.")
70
  except Exception as e:
 
73
  img_gen_pipe = None
74
  try:
75
  img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
76
+ img_gen_pipe.to(STT_DEVICE)
77
  print("βœ… Loaded Stable Diffusion (v1-5) for image generation.")
78
  except Exception as e:
79
  print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
80
 
 
81
  # --- UTILITY FUNCTIONS ---
82
 
83
def simulate_recording_delay(seconds: float = 3):
    """Pause so the user has time to speak before transcription starts.

    Args:
        seconds: How long to block. Defaults to the original fixed
            three-second window; a value of zero or less skips the wait.

    Returns:
        None. The function is used as a step in the mic event chain and
        produces no outputs.
    """
    if seconds > 0:
        time.sleep(seconds)
    return None
86
 
87
def clean_response_stream(raw_text: str) -> str:
    """Return the displayable part of a raw model reply.

    Processing order:
      1. Remove the ``[Intent: ...]`` and ``[Confidence: ...]`` tags.
      2. Cut the text at the first hallucinated turn marker (a new line
         starting with ``User:``, ``Assistant:``, ``Intent:`` or ``Action:``)
         or at ``</s>``.
      3. Remove instruction markers and ``<action>...</action>`` spans.
      4. Drop a duplicated trailing two-word phrase.

    The tags are removed *before* splitting. Splitting on the bare word
    "Intent" first would cut every reply at its own leading ``[Intent: ...]``
    tag and leave only ``"["``.

    Args:
        raw_text: Model output, possibly including tags and trailing junk.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # 1. Strip the routing tags wherever they occur.
    text = re.sub(
        r'\[Intent:\s*\w+\]|\[Confidence:\s*\d+\]',
        '',
        raw_text,
        flags=re.IGNORECASE,
    )

    # 2. Keep only the text before the first runaway turn marker. The
    #    markers are anchored to a line start and a trailing colon so the
    #    plain words "Intent" or "Action" inside a sentence are left alone.
    text = re.split(
        r'\n(?:User|Assistant|Intent|Action):|</s>',
        text,
        maxsplit=1,
    )[0].strip()

    # 3. Remove instruction markers and action spans. An action span is
    #    replaced by a single space so the words around it stay separated.
    text = re.sub(r'\[/?INST\]|\[/?s\]', '', text)
    text = re.sub(r'\s*<action>.*?</action>\s*', ' ', text, flags=re.DOTALL).strip()

    # 4. If the last two words repeat the two words before them, drop the
    #    duplicate pair.
    words = text.split()
    if len(words) > 4 and words[-2:] == words[-4:-2]:
        text = ' '.join(words[:-2])
    return text
97
 
98
def web_search_tool(query: str, delay: float = 1.5) -> str:
    """Simulate a web-search fallback and return a Markdown snippet.

    No network request is made: the function waits, logs the query, and
    returns a fixed sentence that mentions the query.

    Args:
        query: The text the search is nominally run for.
        delay: Seconds to wait to mimic lookup latency. Defaults to the
            original 1.5 s; zero or less skips the wait.

    Returns:
        A string starting with a blank line followed by the simulated
        "Web Search Results" notice.
    """
    if delay > 0:
        time.sleep(delay)
    print(f"Simulating Google Search fallback for: {query}")
    return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
102
 
103
+ # FIX: Confidence check operates on RAW response string
104
def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
    """Return the cleaned reply, augmented with a search snippet when confidence is low.

    The confidence score is read from the ``[Confidence: N]`` tag of the raw
    reply; a missing tag counts as 0. Scores of 70 or more return the cleaned
    reply unchanged. Lower scores trigger ``web_search_tool`` for ``prompt``.

    Args:
        raw_response_with_tags: Model output that still carries its tags.
        prompt: The user's original message, used as the search query.

    Returns:
        The cleaned reply, the cleaned reply followed by the search snippet,
        or an apology followed by the snippet when the score is 0 or the
        cleaned reply contains the word "error".
    """
    tag = re.search(r'\[Confidence:\s*(\d+)\]', raw_response_with_tags)
    score = 0 if tag is None else int(tag.group(1))

    # Cleaning happens after the score is parsed, since it removes the tag.
    answer = clean_response_stream(raw_response_with_tags)

    # Confident enough: nothing to add.
    if score >= 70:
        return answer

    print(f"Low confidence ({score}%) detected. Triggering Google Search fallback.")
    snippet = web_search_tool(prompt)

    looks_broken = score == 0 or "error" in answer.lower()
    if looks_broken:
        return f"I apologize for the limited response (Confidence: {score}%). {snippet} I will use this to generate a more comprehensive answer."
    return f"{answer} {snippet} I can elaborate further based on this."
128
 
129
+ # FIX: Correct VQA prompt format and error handling
130
def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
    """Run visual question answering on a staged image.

    Args:
        image_data_or_path: A file path, a NumPy array (as produced by the
            webcam component), or a PIL image.
        message: The user's question about the image.

    Returns:
        A ``(prompt_injection, success)`` pair. On success the first item
        holds the VLM's analysis followed by the user query. On failure it
        holds a bracketed ``[Image Processing Error: ...]`` note followed by
        the user query, and ``success`` is False.
    """
    if image_pipe is None:
        return f"[Image Processing Error: VLM model is not loaded.] **User Query:** {message}", False

    try:
        image = None
        if isinstance(image_data_or_path, str):
            # The context manager closes the file handle as soon as the
            # pixel data has been copied into the converted image.
            with Image.open(image_data_or_path) as opened:
                image = opened.convert("RGB")
        elif isinstance(image_data_or_path, np.ndarray):
            image = Image.fromarray(image_data_or_path).convert("RGB")
        elif isinstance(image_data_or_path, Image.Image):
            image = image_data_or_path.convert("RGB")

        if image is None:
            # Unsupported input type: nothing was loaded.
            return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", False

        # Prompt layout used for the llava-hf/llava-1.5-7b-hf checkpoint.
        vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
        results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
        raw_vlm_output = results[0]['generated_text'] if results else ""

        # The pipeline output includes the prompt; keep the assistant's part only.
        vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
        if not vqa_response:
            # An empty analysis is a failure, so the caller must not treat
            # this turn as a successful VQA flow.
            return f"[Image Processing Error: VLM analysis failed or returned empty.] **User Query:** {message}", False

        return f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}", True

    except Exception as e:
        # Best-effort boundary: any loading or inference failure is reported
        # in the prompt text instead of crashing the chat turn.
        print(f"Image Pipeline Error: {e}")
        return f"[Image Processing Error: {e}] **User Query:** {message}", False
171
+
172
+
173
+ # --- (transcribe_audio, text_to_audio remain the same) ---
174
def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
    """Transcribe a recorded audio file and prepare the UI for sending it.

    Args:
        audio_file_path: Path of the recording, or None when nothing was recorded.

    Returns:
        A 6-tuple: transcribed text, hint message, textbox update, send-button
        update, voice-chat flag, and fact-check row update. On failure the
        text is empty, the hint carries the error, and the flag is False.
    """
    def _send_button():
        # Send button restored to its interactive "send" look.
        return gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])

    def _failure(reason: str):
        return "", reason, gr.update(interactive=True), _send_button(), False, gr.update(visible=False)

    if stt_pipe is None or audio_file_path is None:
        return _failure("Error: Whisper model failed to load or no audio recorded.")

    try:
        spoken = stt_pipe(audio_file_path)["text"].strip()
        return (
            spoken,
            f"πŸŽ™οΈ Transcribed: '{spoken}'",
            gr.update(interactive=True),
            _send_button(),
            True,
            gr.update(visible=False),
        )
    except Exception as e:
        return _failure(f"Transcription Error: {e}")
192
 
 
193
def text_to_audio(text: str, is_voice_chat: bool) -> "str | None":
    """Synthesize speech for a reply when the turn came from voice input.

    Code fences, image-processing error notes, the simulated web-search
    notice and ``(file=...)`` download links are removed before synthesis.

    Args:
        text: The final reply text.
        is_voice_chat: Whether the current turn originated from the mic.

    Returns:
        Path of the saved MP3, or None when voice chat is off, the speakable
        text is five characters or shorter, or synthesis fails.
    """
    if not is_voice_chat:
        return None

    # The web-search alternative matches the notice emitted by
    # web_search_tool ("**Web Search Results for '...':** ...") up to the
    # end of its line.
    speakable = re.sub(
        r'```.*?```'
        r'|\[Image Processing Error:.*?\]'
        r'|(?:🌐\s*)?\*\*Web Search Results\b[^\n]*'
        r'|\(file=.*?\)',
        '',
        text,
        flags=re.DOTALL,
    ).strip()

    if len(speakable) <= 5:
        return None

    import uuid  # local import: only needed for the output file name

    try:
        # A UUID keeps concurrent replies from overwriting each other's audio.
        audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{uuid.uuid4().hex}.mp3")
        gTTS(text=speakable, lang='en').save(audio_output_path)
        return audio_output_path
    except Exception as e:
        # Speech is optional; a failure here must not break the chat turn.
        print(f"gTTS Error: {e}")
        return None
207
 
208
+ # --- (INTENT_STATUS_MAP remains the same) ---
 
209
  INTENT_STATUS_MAP = {
210
  "code_generate": "Analyzing requirements and drafting code πŸ’»...",
211
  "code_explain": "Reviewing code logic and writing explanation πŸ’‘...",
 
220
  "default": "Luna is thinking...",
221
  }
222
 
223
+
224
+ # FIX: Updated get_intent_status to force VQA intent more reliably
225
def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
    """Parse the intent tag and return display-ready text.

    Args:
        raw_response: The model output accumulated so far, tags included.
        is_vqa_flow: When True the intent is forced to ``"vqa"`` regardless
            of what the model wrote.

    Returns:
        ``(intent, status, cleaned_text)``: the lower-cased intent (or
        ``"default"`` when no tag is present), the matching status hint from
        ``INTENT_STATUS_MAP``, and the response with the first intent tag and
        first confidence tag removed.
    """
    # 1. Parse the intent.
    match = re.search(r'\[Intent:\s*(\w+)\]', raw_response, re.IGNORECASE)
    intent = match.group(1).lower() if match else "default"

    # A turn that started with an image is always reported as VQA.
    if is_vqa_flow:
        intent = "vqa"

    # 2. Strip both tags for display. The same case-insensitive matching as
    #    the parser is used so a tag that was recognised is also removed.
    cleaned_text = re.sub(
        r'\[Intent:\s*\w+\]\s*', '', raw_response, count=1, flags=re.IGNORECASE
    ).strip()
    cleaned_text = re.sub(
        r'\[Confidence:\s*\d+\]\s*', '', cleaned_text, count=1, flags=re.IGNORECASE
    ).strip()

    # 3. Look up the status hint; unknown intents fall back to the default.
    status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
    return intent, status, cleaned_text
243
 
244
 
245
+ # --- (generate_file_content remains the same) ---
 
246
def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
    """Create an image, DOCX or PPTX from ``content`` and report it in the chat.

    The last history entry's content is overwritten with either a download
    message or an error message.

    Args:
        content: Prompt for the image, or body text for the document/slides.
        history: Chat history; its last entry is modified in place.
        file_type: One of ``"image"``, ``"doc"`` or ``"ppt"``.

    Returns:
        ``(history, file_path)`` where ``file_path`` is the saved file, or
        None when generation failed.
    """
    def _new_target(prefix: str, extension: str):
        # File name with a random four-digit suffix, plus its path in DOC_DIR.
        name = f"{prefix}_{random.randint(1000, 9999)}.{extension}"
        return name, os.path.join(DOC_DIR, name)

    file_path = None
    try:
        if file_type == "image":
            if img_gen_pipe is None:
                raise RuntimeError("Image generation model not loaded.")
            picture = img_gen_pipe(content).images[0]
            file_filename, file_path = _new_target("generated_img", "png")
            picture.save(file_path)
            display_content = f"πŸ–ΌοΈ **Image Generated!**\n\n[Download {file_filename}](file={file_path})"

        elif file_type == "doc":
            document = Document()
            document.add_heading('Luna Generated Document', 0)
            document.add_paragraph(content)
            file_filename, file_path = _new_target("generated_doc", "docx")
            document.save(file_path)
            display_content = f"πŸ“„ **Document Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"

        elif file_type == "ppt":
            deck = Presentation()
            # Layout 0 provides the title and the placeholder filled below.
            first_slide = deck.slides.add_slide(deck.slide_layouts[0])
            first_slide.shapes.title.text = "Luna Generated Presentation"
            first_slide.placeholders[1].text = content[:100] + "..."
            file_filename, file_path = _new_target("generated_ppt", "pptx")
            deck.save(file_path)
            display_content = f"πŸ“Š **Presentation Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"

        else:
            raise ValueError(f"Unknown file type: {file_type}")

        history[-1]['content'] = display_content

    except Exception as e:
        # Any failure is shown in the chat instead of propagating.
        history[-1]['content'] = f"❌ **Error generating {file_type.upper()}:** {e}. Check logs/libs."
        file_path = None

    return history, file_path
282
 
 
283
  # --- CORE GENERATOR FUNCTION ---
284
def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
    """Stream Luna's reply for the newest turn, then run any tool action.

    Every yield is an 11-tuple matching ``output_components``:
    [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output,
    is_voice_chat, fact_check_btn_row, staged_image, file_input,
    file_download_output].

    Args:
        message_from_input: Current textbox value. Not used: the user's text
            is read from ``history``, where ``user_turn`` stored it.
        image_input_data: Staged media (file path, NumPy array or None).
        history: Chat history whose last two entries must be the user
            message and an empty assistant placeholder.
        stop_signal: Value of the stop state when the event fired. It is
            passed through to the outputs but never re-read while streaming.
        is_voice_chat: Whether the turn came from the microphone; enables
            speech synthesis of the final reply.

    Yields:
        UI update tuples while streaming, then one final tuple that
        re-enables the inputs and clears the staged image.
    """
    def _busy(hint: str, txt_update: Any = None) -> tuple:
        # UI state while Luna is working: input locked, button shows "Stop".
        return (
            history, stop_signal, hint,
            gr.update(interactive=False) if txt_update is None else txt_update,
            gr.update(value="Stop ⏹️", interactive=True),
            None, is_voice_chat, gr.update(visible=False),
            image_input_data, gr.update(), gr.update(),
        )

    def _halted(hint: str, show_fact_check: bool = False) -> tuple:
        # UI state after an early exit: input re-enabled, voice flag reset.
        return (
            history, False, hint,
            gr.update(interactive=True), gr.update(value="↑", interactive=True),
            None, False, gr.update(visible=show_fact_check),
            image_input_data, gr.update(), gr.update(),
        )

    # 1. INITIAL HISTORY CHECK
    if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
        yield _halted("Error: Generator called in unexpected state.")
        return

    # 2. PRE-PROCESSING & CONTEXT
    last_user_index = len(history) - 2
    original_message = history[last_user_index]['content']

    # Staged media can be a path (upload), an array (webcam) or None.
    if isinstance(image_input_data, str):
        is_vqa_flow = image_input_data != ""
    elif isinstance(image_input_data, np.ndarray):
        is_vqa_flow = image_input_data.size > 0
    else:
        is_vqa_flow = image_input_data is not None

    vqa_success = False
    if is_vqa_flow:
        # The VQA analysis is injected into the prompt; the chat shows a marker.
        llm_input_message, vqa_success = process_image(image_input_data, original_message)
        history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
    else:
        llm_input_message = original_message
        image_input_data = None
    force_vqa_intent = is_vqa_flow and vqa_success

    # Earlier turns only: the current user message is appended once below.
    # Iterating up to and including it would put the same turn into the
    # prompt twice.
    prompt_parts = [f"SYSTEM: {SYSTEM_PROMPT}\n"]
    for item in history[:last_user_index]:
        role = item['role'].upper()
        content = item['content'] if item['content'] is not None else ""
        if role == "ASSISTANT":
            prompt_parts.append(f"LUNA: {content}\n")
        elif role == "USER":
            prompt_parts.append(f"USER: {content}\n")
    prompt_parts.append(f"USER: {llm_input_message}\nLUNA: ")
    prompt = "".join(prompt_parts)

    # 3. HINT BOX & STREAM START
    history[-1]['content'] = ""
    yield _busy("✨ Luna is starting to think...", gr.update(value="", interactive=False))
    time.sleep(0.5)

    # 4. DIRECT STREAMING
    full_response = ""
    current_intent = "default"
    try:
        stream = llm.create_completion(
            prompt=prompt, max_tokens=8192,
            stop=["USER:", "SYSTEM:", "</s>"],
            echo=False, stream=True, temperature=0.7
        )
    except Exception as e:
        error_text = f"❌ Error generating response: {e}"
        history[-1]['content'] = error_text
        yield _halted(error_text)
        return

    try:
        for output in stream:
            full_response += output["choices"][0].get("text", "")
            current_intent, current_hint, display_text = get_intent_status(full_response, force_vqa_intent)
            history[-1]['content'] = display_text
            yield _busy(current_hint)
    except Exception as e:
        # Keep whatever was streamed so far and offer the fact-check row.
        _, _, partial_text = get_intent_status(full_response, force_vqa_intent)
        history[-1]['content'] = partial_text
        yield _halted(f"⚠️ Streaming interrupted: {e}", show_fact_check=True)
        return

    # 5. POST-PROCESSING & TOOL EXECUTION
    file_download_path = None
    _, _, content_for_tool = get_intent_status(full_response, force_vqa_intent)

    # 5a. File generation / tool action based on the final intent.
    file_kinds = {"image_generate": "image", "doc_generate": "doc", "ppt_generate": "ppt"}
    if current_intent in file_kinds:
        yield _busy(INTENT_STATUS_MAP[current_intent])
        history, file_download_path = generate_file_content(content_for_tool, history, file_kinds[current_intent])
    elif current_intent == "open_google":
        history[-1]['content'] = content_for_tool + "\n\nπŸ”— **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
    elif current_intent == "open_camera":
        history[-1]['content'] = content_for_tool + "\n\nπŸ“Έ **Action:** Use the 'Google Lens' button to capture an image."

    # 5b. Confidence check, skipped for tool intents whose content is already set.
    TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
    if current_intent not in TOOL_EXECUTION_INTENTS:
        # The raw response is passed because the checker needs the tag.
        final_response_content = check_confidence_and_augment(full_response, original_message)
        history[-1]['content'] = final_response_content
    else:
        final_response_content = history[-1]['content']

    # 5c. TTS generation (returns None unless this is a voice turn).
    audio_file_path = text_to_audio(final_response_content, is_voice_chat)

    # 6. FINAL YIELD
    # staged_image receives a plain None rather than gr.update(value=None):
    # a gr.State output stores the returned value itself.
    yield (
        history, False, "βœ… Response generated.",
        gr.update(interactive=True), gr.update(value="↑", interactive=True),
        audio_file_path, False, gr.update(visible=True),
        None, gr.update(), file_download_path,
    )
402
+
403
 
404
  # --- GRADIO WRAPPERS FOR UI ACTIONS ---
405
 
406
def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
    """Flip the options menu between open and closed.

    Args:
        current_visibility: Whether the menu is currently shown.

    Returns:
        The new visibility flag, an update showing/hiding the menu, an update
        hiding the fact-check row, and an update swapping the menu button icon.
    """
    menu_now_open = not current_visibility
    button_icon = "⬇️" if menu_now_open else "βž•"
    return (
        menu_now_open,
        gr.update(visible=menu_now_open),
        gr.update(visible=False),
        gr.update(value=button_icon),
    )
409
 
410
+ # FIX: user_turn now only adds history if input exists, DOES NOT clear staged_image
411
def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
    """Record the user's turn and add an empty assistant placeholder.

    Nothing is appended when there is neither text nor staged media, or when
    the previous assistant placeholder is still empty (a reply is pending).
    The staged media itself is never modified here.

    Args:
        user_message: Text from the input box.
        chat_history: Message list, extended in place.
        staged_image_input: Staged media (path, NumPy array or None).

    Returns:
        ``("", chat_history)`` after appending, which clears the text box;
        otherwise the unchanged ``(user_message, chat_history)``.
    """
    # Decide whether any media is staged, by input type.
    if isinstance(staged_image_input, np.ndarray):
        media_staged = staged_image_input.size > 0
    elif isinstance(staged_image_input, str):
        media_staged = staged_image_input != ""
    else:
        media_staged = staged_image_input is not None

    text_present = bool(user_message)
    if not (text_present or media_staged):
        return user_message, chat_history

    # An empty assistant entry means the previous reply has not arrived yet.
    reply_pending = (
        bool(chat_history)
        and chat_history[-1]['role'] == 'assistant'
        and chat_history[-1]['content'] == ""
    )
    if reply_pending:
        return user_message, chat_history

    # Media without text gets a stand-in message.
    shown_text = user_message if text_present else "Analyzing Staged Media."
    chat_history.append({"role": "user", "content": shown_text})
    chat_history.append({"role": "assistant", "content": ""})
    return "", chat_history
446
 
447
def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
    """Stage an uploaded file for the next message.

    Args:
        file_path: Path of the uploaded file, or a falsy value when the
            upload was cleared.

    Returns:
        The value to stage (the path or None), a hint message, an update
        that empties and enables the text box, and an update that disables
        the file uploader.
    """
    if not file_path:
        staged, hint = None, "File upload cancelled."
    else:
        staged = file_path
        hint = f"πŸ“Ž File staged: {os.path.basename(file_path)}. Click send (✈️)."
    return staged, hint, gr.update(value="", interactive=True), gr.update(interactive=False)
 
451
 
452
+ # FIX: Reinstate clear_staged_media
453
def clear_staged_media() -> None:
    """Return the value that resets the staged-media state.

    The function is wired to the ``staged_image`` state component as its
    only output. A state component stores whatever value the handler
    returns, so a plain ``None`` is returned rather than an update object.

    Returns:
        None, which becomes the new state value.
    """
    return None
456
 
457
+ # --- (manual_fact_check, auto_capture_camera remain largely the same, ensure they use history format correctly) ---
458
def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, gr.update]:
    """Append simulated web-search results to the latest reply.

    Args:
        history: Chat history; the content of its last entry is extended.

    Returns:
        The history, a hint message, and an update hiding the fact-check row.
        When there is no final response or no user query can be found, the
        history is returned unchanged with an error hint.
    """
    if not history or not history[-1]['content']:
        return history, "Error: No final response to check.", gr.update(visible=False)

    # Most recent non-empty user message, reduced to the bare query: the
    # VQA wrapper before "**User Query:**" and the image marker are removed.
    last_user_prompt = next(
        (
            entry['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
            for entry in reversed(history)
            if entry['role'] == 'user' and entry['content']
        ),
        "",
    )
    if not last_user_prompt:
        return history, "Error: Could not find query.", gr.update(visible=False)

    web_results = web_search_tool(last_user_prompt)

    # The list is copied, but its entries are the same dict objects.
    new_history = list(history)
    new_history[-1]['content'] += web_results
    return new_history, "βœ… Double-checked with web facts.", gr.update(visible=False)
471
 
472
def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]], Any, gr.update, gr.update, gr.update, gr.update, gr.update]:
    """Record the user's turn and switch the UI to camera capture.

    Args:
        user_message: Text from the input box.
        chat_history: Message list, passed through ``user_turn``.
        staged_image_input: Currently staged media, returned unchanged.

    Returns:
        An 8-tuple: empty text, the history, the staged media, three
        visibility updates (hidden, shown, hidden), a non-interactive update
        carrying the countdown message, and an update resetting the menu icon.
    """
    # user_turn appends the user message and an empty assistant placeholder.
    chat_history = user_turn(user_message, chat_history, staged_image_input)[1]

    # Fill the fresh placeholder with a status line, if there is one.
    newest = chat_history[-1] if chat_history else None
    if newest is not None and newest['role'] == 'assistant' and newest['content'] == "":
        newest['content'] = "πŸ“Έ Preparing camera capture..."

    return (
        "",
        chat_history,
        staged_image_input,
        gr.update(visible=False),
        gr.update(visible=True),
        gr.update(visible=False),
        gr.update(value="πŸ“Έ Capturing in 3 seconds...", interactive=False),
        gr.update(value="βž•"),
    )
481
 
482
 
483
  # --- GRADIO INTERFACE ---
484
 
485
with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:

    # --- STATE ---
    stop_signal = gr.State(value=False)
    is_voice_chat = gr.State(value=False)
    staged_image = gr.State(value=None)        # path or array waiting to be sent
    menu_visible_state = gr.State(value=False)

    # --- LAYOUT ---
    gr.HTML("<h1 style='text-align: center; color: #4B0082;'>πŸŒ™ Luna Chat Space</h1>")
    hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
    file_download_output = gr.File(label="Generated File", visible=False)  # hidden file component

    with gr.Row(visible=False) as fact_check_btn_row:
        gr.Column(min_width=1)
        btn_fact_check = gr.Button("Fact Check πŸ”Ž")
        gr.Column(min_width=1)

    chatbot = gr.Chatbot(label="Luna", height=500, type='messages')

    with gr.Row(visible=False) as webcam_capture_row:
        webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
        close_webcam_btn = gr.Button("βœ… Use this image")

    with gr.Row(visible=False) as audio_record_row:
        audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)

    with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
        file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
        btn_take_photo = gr.Button("πŸ“Έ Google Lens (Take Photo)")
        btn_add_files = gr.Button("πŸ“Ž Upload File")

    with gr.Row(variant="panel") as input_row:
        btn_menu = gr.Button("βž•", interactive=True, size="sm")
        txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
        mic_btn = gr.Button("πŸŽ™οΈ", interactive=True, size="sm")
        combined_btn = gr.Button("✈️", variant="primary", size="sm")

    audio_output = gr.Audio(visible=False)

    # Order must match the tuples yielded by chat_generator.
    output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
    generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]

    # --- WIRE EVENTS ---

    # Menu toggle.
    btn_menu.click(fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False)

    # File upload: close the menu and enable the uploader, then stage the file.
    def prepare_file_upload():
        return gr.update(visible=False), gr.update(value="βž•"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")

    btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
    file_input.change(fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False)

    # Take photo: swap the input row for the webcam row.
    btn_take_photo.click(
        fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "πŸ“Έ Camera Active. Capture an image.", gr.update(value="βž•")),
        inputs=[],
        outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu],
        queue=False,
    )

    # Webcam close: stage the captured array and restore the input row.
    close_webcam_btn.click(
        fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, "πŸ“Έ Photo staged: Click send (✈️).", gr.update(value="")),
        inputs=[webcam_capture_component],
        outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt],
        queue=False,
    )

    # Mic: record -> transcribe -> add the turn -> generate -> clear staged media.
    # The chain is wrapped in parentheses so every step, including the last
    # two, is part of one expression.
    (
        mic_btn.click(fn=lambda: (gr.update(visible=False), gr.update(visible=True), "πŸŽ™οΈ Recording..."), inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False)
        .then(fn=simulate_recording_delay, inputs=[], outputs=[], queue=False)
        .then(fn=lambda: (gr.update(visible=True), gr.update(visible=False), "πŸŽ™οΈ Processing..."), inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False)
        .then(fn=transcribe_audio, inputs=audio_input, outputs=[txt, hint_box, txt, combined_btn, is_voice_chat, fact_check_btn_row], queue=False)
        .then(fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False)
        .then(fn=chat_generator, inputs=[txt, staged_image, chatbot, stop_signal, is_voice_chat], outputs=output_components, queue=True)
        .then(fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False)
    )

    # Main submission (Enter key and send button): user_turn reads the
    # staged media but does not change it; it is cleared after generation.
    for submit_event in (txt.submit, combined_btn.click):
        (
            submit_event(fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False)
            .then(fn=chat_generator, inputs=generator_inputs, outputs=output_components, queue=True)
            .then(fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False)
        )

    # Fact check.
    btn_fact_check.click(fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True)

demo.queue(max_size=20).launch(server_name="0.0.0.0")