cosmosai471 committed on
Commit 6ac02a9 · verified · 1 Parent(s): c2a3849

Update app.py

Files changed (1)
  1. app.py +185 -122
app.py CHANGED
@@ -17,18 +17,22 @@ from io import BytesIO
17
  import numpy as np
18
 
19
  # --- CONFIGURATION & INITIALIZATION ---
20
- STT_DEVICE = "cpu"
21
  os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
22
  AUDIO_DIR = "audio_outputs"
23
- DOC_DIR = "doc_outputs"
24
  if not os.path.exists(AUDIO_DIR):
25
  os.makedirs(AUDIO_DIR)
26
  if not os.path.exists(DOC_DIR):
27
  os.makedirs(DOC_DIR)
28
  REPO_ID = "cosmosai471/Luna-v3"
29
  MODEL_FILE = "luna.gguf"
30
- LOCAL_MODEL_PATH = MODEL_FILE
31
- SYSTEM_PROMPT = "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. Your full response must follow these tags."
32
 
33
  # Configuration: confidence threshold for triggering web search fallback
34
  CONFIDENCE_THRESHOLD = 30 # only trigger web-search fallback if confidence is less than this
@@ -51,10 +55,10 @@ try:
51
  print("Initializing Llama...")
52
  llm = Llama(
53
  model_path=LOCAL_MODEL_PATH,
54
- n_ctx=8192,
55
- n_threads=4,
56
- n_batch=256,
57
- n_gpu_layers=0,
58
  verbose=False
59
  )
60
  print("βœ… Luna Model loaded successfully!")
@@ -62,6 +66,7 @@ except Exception as e:
62
  print(f"❌ Error loading Luna model: {e}")
63
  class DummyLLM:
64
  def create_completion(self, *args, **kwargs):
 
65
  yield {'choices': [{'text': '[Intent: qa_general][Confidence: 0] ERROR: Luna model failed to load. Check logs and resources.'}]}
66
  llm = DummyLLM()
67
 
@@ -74,7 +79,7 @@ except Exception as e:
74
 
75
  image_pipe = None
76
  try:
77
- VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
78
  image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
79
  print(f"βœ… Loaded {VLM_MODEL_ID} for image processing.")
80
  except Exception as e:
@@ -83,7 +88,7 @@ except Exception as e:
83
  img_gen_pipe = None
84
  try:
85
  img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
86
- img_gen_pipe.to(STT_DEVICE)
87
  print("βœ… Loaded Stable Diffusion (v1-5) for image generation.")
88
  except Exception as e:
89
  print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
@@ -93,65 +98,58 @@ except Exception as e:
93
 
94
  def simulate_recording_delay():
95
  time.sleep(3)
96
- return None
97
 
98
  def clean_response_stream(raw_text: str) -> str:
99
  """Cleans up raw response text by removing tags and repeats.
100
- NOTE: do NOT split on the plain word 'Intent' or 'Action' because that would
101
- chop off the tags and/or the rest of the response in many outputs.
102
  """
103
- # Safely cut at common separators that mark model output boundaries
104
  clean_text = re.split(r'\nUser:|\nAssistant:|</s>', raw_text, 1)[0].strip()
105
  # Remove bracketed instruction tokens and inline actions
106
  clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
107
- # Remove explicit tags if they are present (we remove them from visible output)
108
- clean_text = re.sub(r'\[Intent:\s*[\w\-]+\]|\[Confidence:\s*\d{1,3}\]', '', clean_text, flags=re.IGNORECASE).strip()
109
- # Remove repeated trailing words (simple dedupe heuristic)
110
  words = clean_text.split()
111
- if len(words) > 4 and words[-2:] == words[-4:-2]:
112
  clean_text = ' '.join(words[:-2])
113
  return clean_text
114
 
115
  def web_search_tool(query: str) -> str:
116
- time.sleep(1.5)
117
  print(f"Simulating Google Search fallback for: {query}")
118
  return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
119
 
120
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
121
  """Checks confidence from the raw response tag and triggers fallback if very low.
122
 
123
- Improvements:
124
- - Uses a robust regex for confidence.
125
- - If the response lacks a confidence tag, uses a simple length-based heuristic
126
- to decide whether to consider confidence low or high (avoids defaulting to 0).
127
- - Only triggers the web-search fallback when confidence is < CONFIDENCE_THRESHOLD.
128
  """
129
- # Try to extract explicit confidence tag
130
  confidence_match = re.search(r'\[Confidence:\s*([0-9]{1,3})\]', raw_response_with_tags, flags=re.IGNORECASE)
131
  cleaned_response = clean_response_stream(raw_response_with_tags)
132
 
133
  if confidence_match:
134
  try:
135
  confidence_score = int(confidence_match.group(1))
136
- # clamp to 0-100
137
  confidence_score = max(0, min(confidence_score, 100))
138
  except Exception:
139
  confidence_score = 0
140
  else:
141
- # heuristic: if the cleaned response is short/empty -> likely low-confidence output
142
  if not cleaned_response or len(cleaned_response.strip()) < 30:
143
- confidence_score = 10 # very low: trigger fallback
144
  else:
145
- confidence_score = 85 # assume decent confidence when there's a substantial response
146
 
147
- # Decide whether to invoke web search fallback
148
  if confidence_score < CONFIDENCE_THRESHOLD:
149
- print(f"Low confidence ({confidence_score}%) detected (threshold={CONFIDENCE_THRESHOLD}). Triggering Google Search fallback.")
150
  search_snippet = web_search_tool(prompt)
151
  if "error" in cleaned_response.lower() or confidence_score <= 5:
152
- final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
153
  else:
154
- # keep whatever content exists, then add web results to supplement
155
  final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
156
  else:
157
  final_response = cleaned_response
@@ -159,7 +157,10 @@ def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> st
159
  return final_response
160
 
161
  def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
162
- """Uses the VLM pipeline (LLaVA) for VQA."""
 
 
 
163
  global image_pipe
164
  success = False
165
  if image_pipe is None:
@@ -169,25 +170,33 @@ def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
169
  try:
170
  if isinstance(image_data_or_path, str):
171
  image = Image.open(image_data_or_path).convert("RGB")
172
- elif isinstance(image_data_or_path, np.ndarray):
173
  image = Image.fromarray(image_data_or_path).convert("RGB")
174
 
175
  if image:
176
  vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
177
  results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
178
- raw_vlm_output = results[0]['generated_text'] if results else "Error: VLM did not return text."
179
- vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip()
180
- if not vqa_response: vqa_response = "VLM analysis failed or returned empty."
181
 
182
  del image
183
- success = True
184
  prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
185
  return prompt_injection, success
186
-
187
  except Exception as e:
188
  print(f"Image Pipeline Error: {e}")
189
  return f"[Image Processing Error: {e}] **User Query:** {message}", success
190
-
191
  return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
192
 
193
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
@@ -198,11 +207,11 @@ def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.upda
198
  transcribed_text = stt_pipe(audio_file_path)["text"]
199
  new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
200
  return (
201
- transcribed_text.strip(),
202
- f"πŸŽ™οΈ Transcribed: '{transcribed_text.strip()}'",
203
- gr.update(interactive=True),
204
- new_button_update,
205
- True,
206
  gr.update(visible=False)
207
  )
208
  except Exception as e:
@@ -211,14 +220,14 @@ def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.upda
211
 
212
  def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
213
  if not is_voice_chat:
214
- return None
215
  clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$|\(file=.*?\)', '', text, flags=re.DOTALL | re.MULTILINE)
216
  if len(clean_text.strip()) > 5:
217
  try:
218
  audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
219
  tts = gTTS(text=clean_text.strip(), lang='en')
220
  tts.save(audio_output_path)
221
- return audio_output_path
222
  except Exception as e:
223
  print(f"gTTS Error: {e}")
224
  return None
@@ -239,22 +248,35 @@ INTENT_STATUS_MAP = {
239
  }
240
 
241
  def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
242
- """Parses intent/confidence, returns intent, status, cleaned text."""
243
- match = re.search(r'\[Intent:\s*([\w\-]+)\]', raw_response, re.IGNORECASE)
244
  intent = match.group(1).lower() if match else "default"
245
  if is_vqa_flow:
246
  intent = "vqa"
247
- cleaned_text = re.sub(r'\[Intent:\s*[\w\-]+\]\s*', '', raw_response, count=1, flags=re.IGNORECASE).strip()
 
248
  cleaned_text = re.sub(r'\[Confidence:\s*\d{1,3}\]\s*', '', cleaned_text, count=1, flags=re.IGNORECASE).strip()
 
249
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
250
  return intent, status, cleaned_text
251
 
252
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
253
- """Generates a file (Image, DOCX, PPTX) and returns the file path for download."""
 
 
 
254
  file_path = None
255
  try:
256
  if file_type == "image":
257
- if img_gen_pipe is None: raise RuntimeError("Image generation model not loaded.")
 
258
  image = img_gen_pipe(content).images[0]
259
  file_filename = f"generated_img_{random.randint(1000, 9999)}.png"
260
  file_path = os.path.join(DOC_DIR, file_filename)
@@ -272,13 +294,18 @@ def generate_file_content(content: str, history: List[Dict[str, str]], file_type
272
  prs = Presentation()
273
  slide = prs.slides.add_slide(prs.slide_layouts[0])
274
  slide.shapes.title.text = "Luna Generated Presentation"
275
- slide.placeholders[1].text = content[:100] + "..."
276
  file_filename = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
277
  file_path = os.path.join(DOC_DIR, file_filename)
278
  prs.save(file_path)
279
  display_content = f"📊 **Presentation Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"
280
  else:
281
  raise ValueError(f"Unknown file type: {file_type}")
 
282
  history[-1]['content'] = display_content
283
  except Exception as e:
284
  error_msg = f"❌ **Error generating {file_type.upper()}:** {e}. Check logs/libs."
@@ -288,102 +315,134 @@ def generate_file_content(content: str, history: List[Dict[str, str]], file_type
288
 
289
  # --- CORE GENERATOR FUNCTION ---
290
  def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
291
- # Component Outputs: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output (INVISIBLE)]
292
 
293
- if len(history) < 2 or history[-1]['role'] != 'assistant' or history[-1]['content'] != "":
294
- yield history, False, "Error: Generator called in unexpected state.", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
 
295
  return
296
 
297
- last_user_index = len(history) - 2
298
- original_message = history[last_user_index]['content']
299
 
 
300
  is_vqa_flow = False
301
- if isinstance(image_input_data, str):
302
  is_vqa_flow = image_input_data != ""
303
- elif isinstance(image_input_data, np.ndarray):
304
- is_vqa_flow = image_input_data.size > 0
305
- else:
306
  is_vqa_flow = image_input_data is not None
307
 
 
308
  vqa_success = False
 
309
  if is_vqa_flow:
310
  processed_message, vqa_success = process_image(image_input_data, original_message)
 
311
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
312
  llm_input_message = processed_message
313
- else:
314
- llm_input_message = original_message
315
- image_input_data = None
316
 
 
317
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
318
- for item in history[:-1]:
319
  role = item['role'].upper()
320
  content = item['content'] if item['content'] is not None else ""
321
- if role == "ASSISTANT": prompt += f"LUNA: {content}\n"
322
- elif role == "USER": prompt += f"USER: {content}\n"
 
 
323
  prompt += f"USER: {llm_input_message}\nLUNA: "
324
 
325
- hint_text = "✨ Luna is starting to think..."
326
- history[-1]['content'] = ""
327
- yield history, stop_signal, hint_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
328
- time.sleep(0.5)
 
 
 
329
 
330
  full_response = ""
331
- current_intent = "default"
332
-
333
  try:
334
  stream = llm.create_completion(
335
- prompt=prompt, max_tokens=8192,
336
  stop=["USER:", "SYSTEM:", "</s>"],
337
  echo=False, stream=True, temperature=0.7
338
  )
339
  except Exception as e:
340
  error_text = f"❌ Error generating response: {e}"
 
341
  history[-1]['content'] = error_text
342
  yield history, False, error_text, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
343
  return
344
 
 
345
  try:
346
  for output in stream:
347
  token = output["choices"][0].get("text", "")
348
  full_response += token
349
  current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
350
- history[-1]['content'] = display_text
 
 
351
  yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
352
  except Exception as e:
 
353
  _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
354
  error_msg = f"⚠️ Streaming interrupted: {e}"
355
- history[-1]['content'] = final_response_text
356
  yield history, False, error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=True), image_input_data, gr.update(), gr.update()
357
  return
358
 
359
- # 5. POST-PROCESSING & TOOL EXECUTION
360
  file_download_path = None
361
  _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success)
362
 
 
363
  if current_intent == "image_generate":
364
- yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
365
- history, file_download_path = generate_file_content(content_for_tool, history, "image")
366
  elif current_intent == "doc_generate":
367
- yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
368
- history, file_download_path = generate_file_content(content_for_tool, history, "doc")
369
  elif current_intent == "ppt_generate":
370
- yield history, stop_signal, INTENT_STATUS_MAP[current_intent], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
371
- history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
372
  elif current_intent == "open_google":
373
  final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
374
  history[-1]['content'] = final_cleaned_response
375
  elif current_intent == "open_camera":
376
  final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
377
  history[-1]['content'] = final_cleaned_response
378
-
379
- TOOL_EXECUTION_INTENTS = ["image_generate", "doc_generate", "ppt_generate", "open_google", "open_camera", "vqa"]
380
- if current_intent not in TOOL_EXECUTION_INTENTS:
381
  final_response_content = check_confidence_and_augment(full_response, original_message)
382
  history[-1]['content'] = final_response_content
383
- else:
384
- final_response_content = history[-1]['content']
385
 
386
- audio_file_path = text_to_audio(final_response_content, is_voice_chat)
387
 
388
  hint = "βœ… Response generated."
389
  yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(), file_download_path
@@ -392,12 +451,16 @@ def chat_generator(message_from_input: str, image_input_data: Any, history: List
392
  # --- GRADIO WRAPPERS FOR UI ACTIONS ---
393
 
394
  def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
395
- new_visibility = not current_visibility
396
  return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "➕")
397
 
398
  def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
399
- """Appends the user message to the chat history if text or image is provided."""
400
- has_text = bool(user_message)
401
  has_image = False
402
  if isinstance(staged_image_input, str):
403
  has_image = staged_image_input != ""
@@ -409,17 +472,17 @@ def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_imag
409
  if not has_text and not has_image:
410
  return user_message, chat_history
411
 
412
- if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] == "":
413
- return user_message, chat_history
 
414
 
415
  if not has_text and has_image:
416
  user_message_to_add = "Analyzing Staged Media."
417
  else:
418
- user_message_to_add = user_message
419
-
420
- chat_history.append({"role": "user", "content": user_message_to_add})
421
- chat_history.append({"role": "assistant", "content": ""})
422
 
 
 
423
  return "", chat_history
424
 
425
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
@@ -439,7 +502,8 @@ def manual_fact_check(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str
439
  if item['role'] == 'user' and item['content']:
440
  last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
441
  break
442
- if not last_user_prompt: return history, "Error: Could not find query.", gr.update(visible=False)
 
443
  web_results = web_search_tool(last_user_prompt)
444
  new_history = list(history)
445
  new_history[-1]['content'] += web_results
@@ -455,32 +519,32 @@ def auto_capture_camera(user_message: str, chat_history: List[Dict[str, str]], s
455
  # --- GRADIO INTERFACE ---
456
 
457
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
458
-
459
  # --- State Components ---
460
  stop_signal = gr.State(value=False)
461
- is_voice_chat = gr.State(value=False)
462
- staged_image = gr.State(value=None)
463
  menu_visible_state = gr.State(value=False)
464
-
465
  gr.HTML("<h1 style='text-align: center; color: #4B0082;'>πŸŒ™ Luna Chat Space</h1>")
466
 
467
- hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
468
- file_download_output = gr.File(label="Generated File", visible=False)
469
 
470
  with gr.Row(visible=False) as fact_check_btn_row:
471
  gr.Column(min_width=1); btn_fact_check = gr.Button("Fact Check 🔎"); gr.Column(min_width=1)
472
 
473
- chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
474
-
475
  with gr.Row(visible=False) as webcam_capture_row:
476
  webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
477
  close_webcam_btn = gr.Button("✅ Use this image")
478
-
479
  with gr.Row(visible=False) as audio_record_row:
480
  audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
481
-
482
  with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
483
- file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
484
  btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
485
  btn_add_files = gr.Button("📎 Upload File")
486
 
@@ -489,20 +553,19 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
489
  txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
490
  mic_btn = gr.Button("🎙️", interactive=True, size="sm")
491
  combined_btn = gr.Button("✈️", variant="primary", size="sm")
492
-
493
- audio_output = gr.Audio(visible=False)
494
 
495
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
496
 
497
  # --- WIRE EVENTS ---
498
-
499
  btn_menu.click(
500
  fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False
501
  )
502
-
503
  def prepare_file_upload(): return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
504
  btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
505
-
506
  file_input.change(
507
  fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False
508
  )
@@ -511,12 +574,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
511
  fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")),
512
  inputs=[], outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu], queue=False
513
  )
514
-
515
  close_webcam_btn.click(
516
  fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️).", gr.update(value="")),
517
  inputs=[webcam_capture_component], outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt], queue=False
518
  )
519
-
520
  mic_btn.click(
521
  fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
522
  inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False
@@ -536,7 +599,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
536
  )
537
 
538
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
539
-
540
  # Text submit (Enter key)
541
  txt.submit(
542
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
@@ -545,7 +608,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
545
  ).then(
546
  fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
547
  )
548
-
549
  # Send button click
550
  combined_btn.click(
551
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
@@ -554,7 +617,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
554
  ).then(
555
  fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
556
  )
557
-
558
  btn_fact_check.click(
559
  fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True
560
  )
 
17
  import numpy as np
18
 
19
  # --- CONFIGURATION & INITIALIZATION ---
20
+ STT_DEVICE = "cpu"
21
  os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
22
  AUDIO_DIR = "audio_outputs"
23
+ DOC_DIR = "doc_outputs"
24
  if not os.path.exists(AUDIO_DIR):
25
  os.makedirs(AUDIO_DIR)
26
  if not os.path.exists(DOC_DIR):
27
  os.makedirs(DOC_DIR)
28
  REPO_ID = "cosmosai471/Luna-v3"
29
  MODEL_FILE = "luna.gguf"
30
+ LOCAL_MODEL_PATH = MODEL_FILE
31
+ SYSTEM_PROMPT = (
32
+ "You are Luna, a helpful and friendly AI assistant. Your response must begin with two separate "
33
+ "tags: an **Intent** tag and a **Confidence** tag (0-100). Example: '[Intent: qa_general][Confidence: 85]'. "
34
+ "Your full response must follow these tags."
35
+ )
36
 
37
  # Configuration: confidence threshold for triggering web search fallback
38
  CONFIDENCE_THRESHOLD = 30 # only trigger web-search fallback if confidence is less than this
 
55
  print("Initializing Llama...")
56
  llm = Llama(
57
  model_path=LOCAL_MODEL_PATH,
58
+ n_ctx=8192,
59
+ n_threads=4,
60
+ n_batch=256,
61
+ n_gpu_layers=0,
62
  verbose=False
63
  )
64
  print("βœ… Luna Model loaded successfully!")
 
66
  print(f"❌ Error loading Luna model: {e}")
67
  class DummyLLM:
68
  def create_completion(self, *args, **kwargs):
69
+ # yield one piece to mimic streaming
70
  yield {'choices': [{'text': '[Intent: qa_general][Confidence: 0] ERROR: Luna model failed to load. Check logs and resources.'}]}
71
  llm = DummyLLM()
72
 
 
79
 
80
  image_pipe = None
81
  try:
82
+ VLM_MODEL_ID = "llava-hf/llava-1.5-7b-hf"
83
  image_pipe = pipeline("image-to-text", model=VLM_MODEL_ID, device=STT_DEVICE)
84
  print(f"βœ… Loaded {VLM_MODEL_ID} for image processing.")
85
  except Exception as e:
 
88
  img_gen_pipe = None
89
  try:
90
  img_gen_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
91
+ img_gen_pipe.to(STT_DEVICE)
92
  print("βœ… Loaded Stable Diffusion (v1-5) for image generation.")
93
  except Exception as e:
94
  print(f"⚠️ Could not load Image Generation pipeline. Image generation disabled. Error: {e}")
 
98
 
99
  def simulate_recording_delay():
100
  time.sleep(3)
101
+ return None
102
 
103
  def clean_response_stream(raw_text: str) -> str:
104
  """Cleans up raw response text by removing tags and repeats.
105
+ We intentionally DO NOT split on plain words 'Intent' or 'Action' to avoid chopping tags.
 
106
  """
107
+ # Cut at common separators marking model streaming boundaries
108
  clean_text = re.split(r'\nUser:|\nAssistant:|</s>', raw_text, 1)[0].strip()
109
  # Remove bracketed instruction tokens and inline actions
110
  clean_text = re.sub(r'\[/?INST\]|\[/?s\]|\s*<action>.*?</action>\s*', '', clean_text, flags=re.DOTALL).strip()
111
+ # Remove tags for display ([Intent: ...], [Confidence: ...]) β€” keep them for parsing elsewhere
112
+ clean_text = re.sub(r'\[Intent:\s*[\w\-\_]+\]|\[Confidence:\s*\d{1,3}\]', '', clean_text, flags=re.IGNORECASE).strip()
113
+ # Deduplicate trailing repeated words (simple heuristic)
114
  words = clean_text.split()
115
+ if len(words) > 4 and words[-2:] == words[-4:-2]:
116
  clean_text = ' '.join(words[:-2])
117
  return clean_text
118
 
119
  def web_search_tool(query: str) -> str:
120
+ time.sleep(1.5)
121
  print(f"Simulating Google Search fallback for: {query}")
122
  return f"\n\n🌐 **Web Search Results for '{query}':** I've gathered information from external sources to supplement my knowledge."
123
 
124
  def check_confidence_and_augment(raw_response_with_tags: str, prompt: str) -> str:
125
  """Checks confidence from the raw response tag and triggers fallback if very low.
126
 
127
+ - If explicit [Confidence: N] exists, use it.
128
+ - Otherwise fall back to heuristic based on cleaned response length.
129
+ - Only triggers web search if below CONFIDENCE_THRESHOLD.
 
 
130
  """
 
131
  confidence_match = re.search(r'\[Confidence:\s*([0-9]{1,3})\]', raw_response_with_tags, flags=re.IGNORECASE)
132
  cleaned_response = clean_response_stream(raw_response_with_tags)
133
 
134
  if confidence_match:
135
  try:
136
  confidence_score = int(confidence_match.group(1))
 
137
  confidence_score = max(0, min(confidence_score, 100))
138
  except Exception:
139
  confidence_score = 0
140
  else:
141
+ # heuristic: very short or empty cleaned response -> low confidence
142
  if not cleaned_response or len(cleaned_response.strip()) < 30:
143
+ confidence_score = 10
144
  else:
145
+ confidence_score = 85
146
 
 
147
  if confidence_score < CONFIDENCE_THRESHOLD:
148
+ print(f"Low confidence ({confidence_score}%) detected (threshold={CONFIDENCE_THRESHOLD}). Triggering web-search fallback.")
149
  search_snippet = web_search_tool(prompt)
150
  if "error" in cleaned_response.lower() or confidence_score <= 5:
151
+ final_response = f"I apologize for the limited response (Confidence: {confidence_score}%). {search_snippet} I will use this to generate a more comprehensive answer."
152
  else:
 
153
  final_response = f"{cleaned_response} {search_snippet} I can elaborate further based on this."
154
  else:
155
  final_response = cleaned_response
 
157
  return final_response
158
 
159
  def process_image(image_data_or_path: Any, message: str) -> Tuple[str, bool]:
160
+ """Perform VQA via the image_pipe. Returns a prompt-injection string for the LLM and success flag.
161
+
162
+ If the VLM fails or returns nothing meaningful, return helpful instructions to the LLM rather than empty.
163
+ """
164
  global image_pipe
165
  success = False
166
  if image_pipe is None:
 
170
  try:
171
  if isinstance(image_data_or_path, str):
172
  image = Image.open(image_data_or_path).convert("RGB")
173
+ elif isinstance(image_data_or_path, np.ndarray):
174
  image = Image.fromarray(image_data_or_path).convert("RGB")
175
 
176
  if image:
177
  vqa_prompt = f"USER: <image>\n{message}\nASSISTANT:"
178
  results = image_pipe(image, prompt=vqa_prompt, generate_kwargs={"max_new_tokens": 1024})
179
+ raw_vlm_output = results[0].get('generated_text', "") if results and isinstance(results, list) else ""
180
+ vqa_response = raw_vlm_output.split("ASSISTANT:")[-1].strip() if raw_vlm_output else ""
181
+
182
+ # If empty or nonsense, produce a friendly fallback message
183
+ if not vqa_response:
184
+ vqa_response = (
185
+ "VQA analysis returned no clear answer. Possible reasons: image unreadable, wrong crop, or "
186
+ "ambiguous content. Please re-upload a clearer image or provide more context about what you want."
187
+ )
188
+ success = False
189
+ else:
190
+ success = True
191
 
192
  del image
 
193
  prompt_injection = f"**VQA Analysis:** {vqa_response}\n\n**User Query:** {message}"
194
  return prompt_injection, success
195
+
196
  except Exception as e:
197
  print(f"Image Pipeline Error: {e}")
198
  return f"[Image Processing Error: {e}] **User Query:** {message}", success
199
+
200
  return f"[Image Processing Error: Could not load image data.] **User Query:** {message}", success
201
 
202
  def transcribe_audio(audio_file_path: str) -> Tuple[str, str, gr.update, gr.update, bool, gr.update]:
 
207
  transcribed_text = stt_pipe(audio_file_path)["text"]
208
  new_button_update = gr.update(value="↑", interactive=True, elem_classes=["circle-btn", "send-mode"])
209
  return (
210
+ transcribed_text.strip(),
211
+ f"πŸŽ™οΈ Transcribed: '{transcribed_text.strip()}'",
212
+ gr.update(interactive=True),
213
+ new_button_update,
214
+ True,
215
  gr.update(visible=False)
216
  )
217
  except Exception as e:
 
220
 
221
  def text_to_audio(text: str, is_voice_chat: bool) -> str or None:
222
  if not is_voice_chat:
223
+ return None
224
  clean_text = re.sub(r'```.*?```|\[Image Processing Error:.*?\]|\*\*Web Search Results:.*?$|\(file=.*?\)', '', text, flags=re.DOTALL | re.MULTILINE)
225
  if len(clean_text.strip()) > 5:
226
  try:
227
  audio_output_path = os.path.join(AUDIO_DIR, f"luna_response_{random.randint(1000, 9999)}.mp3")
228
  tts = gTTS(text=clean_text.strip(), lang='en')
229
  tts.save(audio_output_path)
230
+ return audio_output_path
231
  except Exception as e:
232
  print(f"gTTS Error: {e}")
233
  return None
 
248
  }
249
 
250
  def get_intent_status(raw_response: str, is_vqa_flow: bool) -> Tuple[str, str, str]:
251
+ """Parses intent (and removes tags for display). Returns (intent, status, cleaned_text_for_display)."""
252
+ match = re.search(r'\[Intent:\s*([\w\-\_]+)\]', raw_response, re.IGNORECASE)
253
  intent = match.group(1).lower() if match else "default"
254
  if is_vqa_flow:
255
  intent = "vqa"
256
+ # Remove only the display tags, keep raw_response intact elsewhere
257
+ cleaned_text = re.sub(r'\[Intent:\s*[\w\-\_]+\]\s*', '', raw_response, count=1, flags=re.IGNORECASE).strip()
258
  cleaned_text = re.sub(r'\[Confidence:\s*\d{1,3}\]\s*', '', cleaned_text, count=1, flags=re.IGNORECASE).strip()
259
+ cleaned_text = clean_response_stream(cleaned_text) # extra clean
260
  status = INTENT_STATUS_MAP.get(intent, INTENT_STATUS_MAP["default"])
261
  return intent, status, cleaned_text
262
 
263
  def generate_file_content(content: str, history: List[Dict[str, str]], file_type: str):
264
+ """Generates a file (Image, DOCX, PPTX) and returns the file path for download.
265
+
266
+ If content is too short or missing, ask the user to clarify instead of producing empty files.
267
+ """
268
  file_path = None
269
  try:
270
+ if not content or len(content.strip()) < 20:
271
+ history[-1]['content'] = (
272
+ f"⚠️ I was instructed to generate a {file_type}, but I don't have enough details. "
273
+ "Could you please provide a short description or title for the file (what should it contain)?"
274
+ )
275
+ return history, None
276
+
277
  if file_type == "image":
278
+ if img_gen_pipe is None:
279
+ raise RuntimeError("Image generation model not loaded.")
280
  image = img_gen_pipe(content).images[0]
281
  file_filename = f"generated_img_{random.randint(1000, 9999)}.png"
282
  file_path = os.path.join(DOC_DIR, file_filename)
 
294
  prs = Presentation()
295
  slide = prs.slides.add_slide(prs.slide_layouts[0])
296
  slide.shapes.title.text = "Luna Generated Presentation"
297
+ try:
298
+ slide.placeholders[1].text = content[:200] + "..."
299
+ except Exception:
300
+ # fallback if layout mismatch
301
+ pass
302
  file_filename = f"generated_ppt_{random.randint(1000, 9999)}.pptx"
303
  file_path = os.path.join(DOC_DIR, file_filename)
304
  prs.save(file_path)
305
  display_content = f"📊 **Presentation Generated!** Summary:\n\n{content[:200]}...\n\n[Download {file_filename}](file={file_path})"
306
  else:
307
  raise ValueError(f"Unknown file type: {file_type}")
308
+
309
  history[-1]['content'] = display_content
310
  except Exception as e:
311
  error_msg = f"❌ **Error generating {file_type.upper()}:** {e}. Check logs/libs."
 
315
 
316
  # --- CORE GENERATOR FUNCTION ---
317
  def chat_generator(message_from_input: str, image_input_data: Any, history: List[Dict[str, str]], stop_signal: bool, is_voice_chat: bool) -> Any:
318
+ """
319
+ Returns: [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
320
+ Changes made:
321
+ - user_turn will now only append the user message. We add the assistant entry here once generation starts,
322
+ so there's no empty assistant box created prematurely.
323
+ """
324
 
325
+ # Validate that last item is a USER (we expect user_turn to add only the user record)
326
+ if not history or history[-1]['role'] != 'user':
327
+ yield history, False, "Error: Generator called in unexpected state (no user message found).", gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
328
  return
329
 
330
+ last_user_index = len(history) - 1
331
+ original_message = history[last_user_index]['content'] if history[last_user_index]['content'] is not None else ""
332
 
333
+ # Detect VQA flow
334
  is_vqa_flow = False
335
+ if isinstance(image_input_data, str):
336
  is_vqa_flow = image_input_data != ""
337
+ elif isinstance(image_input_data, np.ndarray):
338
+ is_vqa_flow = image_input_data.size > 0
339
+ else:
340
  is_vqa_flow = image_input_data is not None
341
 
342
+ # Process image if present (returns prompt injection for LLM)
343
  vqa_success = False
344
+ llm_input_message = original_message
345
  if is_vqa_flow:
346
  processed_message, vqa_success = process_image(image_input_data, original_message)
347
+ # Replace the user's content with tag for logging while preserving original_message separately
348
  history[last_user_index]['content'] = f"[IMAGE RECEIVED] {original_message}"
349
  llm_input_message = processed_message
 
 
 
350
 
351
+ # Build prompt (system + conversation)
352
  prompt = f"SYSTEM: {SYSTEM_PROMPT}\n"
353
+ for item in history[:-1]: # all conversation before last user
354
  role = item['role'].upper()
355
  content = item['content'] if item['content'] is not None else ""
356
+ if role == "ASSISTANT":
357
+ prompt += f"LUNA: {content}\n"
358
+ elif role == "USER":
359
+ prompt += f"USER: {content}\n"
360
  prompt += f"USER: {llm_input_message}\nLUNA: "
361
 
362
+ # Now create assistant entry only when we begin generation (avoids empty assistant box)
363
+ assistant_initial_text = "✨ Luna is starting to think..."
364
+ history.append({"role": "assistant", "content": assistant_initial_text})
365
+
366
+ # Early UI update to show the thinking state (assistant box will appear now)
367
+ yield history, stop_signal, assistant_initial_text, gr.update(value="", interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
368
+ time.sleep(0.2)
369
 
370
  full_response = ""
371
+ current_intent = "default"
372
+
373
  try:
374
  stream = llm.create_completion(
375
+ prompt=prompt, max_tokens=8192,
376
  stop=["USER:", "SYSTEM:", "</s>"],
377
  echo=False, stream=True, temperature=0.7
378
  )
379
  except Exception as e:
380
  error_text = f"❌ Error generating response: {e}"
381
+ # update assistant with error
382
  history[-1]['content'] = error_text
383
  yield history, False, error_text, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=False), image_input_data, gr.update(), gr.update()
384
  return
385
 
386
+ # Stream tokens and update assistant content incrementally (without exposing tags)
387
  try:
388
  for output in stream:
389
  token = output["choices"][0].get("text", "")
390
  full_response += token
391
  current_intent, current_hint, display_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
392
+ # display_text is cleaned (no [Intent] or [Confidence])
393
+ # Ensure we never set assistant content to empty — if cleaned is empty, show a small typing indicator
394
+ history[-1]['content'] = display_text if display_text.strip() else "✨ Luna is forming a reply..."
395
  yield history, stop_signal, current_hint, gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
396
  except Exception as e:
397
+ # Stream interruption — salvage what we have
398
  _, _, final_response_text = get_intent_status(full_response, is_vqa_flow and vqa_success)
399
  error_msg = f"⚠️ Streaming interrupted: {e}"
400
+ history[-1]['content'] = final_response_text if final_response_text.strip() else error_msg
401
  yield history, False, error_msg, gr.update(interactive=True), gr.update(value="↑", interactive=True), None, False, gr.update(visible=True), image_input_data, gr.update(), gr.update()
402
  return
403
 
404
+ # POST-PROCESSING & TOOL EXECUTION
405
  file_download_path = None
406
  _, _, content_for_tool = get_intent_status(full_response, is_vqa_flow and vqa_success)
407
 
408
+ # If model wants to run a tool but content is weak, ask for clarification instead of generating empty files
409
  if current_intent == "image_generate":
410
+ if not content_for_tool or len(content_for_tool.strip()) < 20:
411
+ history[-1]['content'] = "I detected a request to generate an image but I don't have enough prompt details. Please give a short description: e.g. 'sunset over mountains, vibrant colors'."
412
+ else:
413
+ history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
414
+ yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
415
+ history, file_download_path = generate_file_content(content_for_tool, history, "image")
416
  elif current_intent == "doc_generate":
417
+ if not content_for_tool or len(content_for_tool.strip()) < 20:
418
+ history[-1]['content'] = "I was asked to generate a document but I need more details — what's the document about? (1–2 sentences.)"
419
+ else:
420
+ history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
421
+ yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
422
+ history, file_download_path = generate_file_content(content_for_tool, history, "doc")
423
  elif current_intent == "ppt_generate":
424
+ if not content_for_tool or len(content_for_tool.strip()) < 20:
425
+ history[-1]['content'] = "I can make a short presentation, but please give me a title and 3–5 bullet points to include."
426
+ else:
427
+ history[-1]['content'] = INTENT_STATUS_MAP[current_intent]
428
+ yield history, stop_signal, history[-1]['content'], gr.update(interactive=False), gr.update(value="Stop ⏹️", interactive=True), None, is_voice_chat, gr.update(visible=False), image_input_data, gr.update(), gr.update()
429
+ history, file_download_path = generate_file_content(content_for_tool, history, "ppt")
430
  elif current_intent == "open_google":
431
  final_cleaned_response = content_for_tool + "\n\n🔗 **Action:** [Search Google](https://www.google.com/search?q=open+google+simulated+search)"
432
  history[-1]['content'] = final_cleaned_response
433
  elif current_intent == "open_camera":
434
  final_cleaned_response = content_for_tool + "\n\n📸 **Action:** Use the 'Google Lens' button to capture an image."
435
  history[-1]['content'] = final_cleaned_response
436
+ else:
437
+ # Normal response path — check confidence and maybe augment with web-search snippet
 
438
  final_response_content = check_confidence_and_augment(full_response, original_message)
439
  history[-1]['content'] = final_response_content
 
 
440
 
441
+ # If after all processing the assistant content is empty (defensive), fill a friendly fallback
442
+ if not history[-1]['content'] or not str(history[-1]['content']).strip():
443
+ history[-1]['content'] = "Sorry — I couldn't produce a good response. Can you rephrase or give more details?"
444
+
445
+ audio_file_path = text_to_audio(history[-1]['content'], is_voice_chat)
446
 
447
  hint = "βœ… Response generated."
448
  yield history, False, hint, gr.update(interactive=True), gr.update(value="↑", interactive=True), audio_file_path, False, gr.update(visible=True), gr.update(value=None), gr.update(), file_download_path
 
451
  # --- GRADIO WRAPPERS FOR UI ACTIONS ---
452
 
453
  def toggle_menu(current_visibility: bool) -> Tuple[bool, gr.update, gr.update, gr.update]:
454
+ new_visibility = not current_visibility
455
  return new_visibility, gr.update(visible=new_visibility), gr.update(visible=False), gr.update(value="⬇️" if new_visibility else "➕")
456
 
457
  def user_turn(user_message: str, chat_history: List[Dict[str, str]], staged_image_input: Any) -> Tuple[str, List[Dict[str, str]]]:
458
+ """
459
+ Appends only the USER message to chat_history. We no longer append an assistant placeholder here,
460
+ so the UI won't show an empty assistant box immediately after user sends a message.
461
+ The assistant will be appended inside chat_generator when generation begins.
462
+ """
463
+ has_text = bool(user_message and user_message.strip())
464
  has_image = False
465
  if isinstance(staged_image_input, str):
466
  has_image = staged_image_input != ""
 
472
  if not has_text and not has_image:
473
  return user_message, chat_history
474
 
475
+ # Prevent double-sending if assistant is already generating (detect last assistant placeholder)
476
+ if chat_history and chat_history[-1]['role'] == 'assistant' and chat_history[-1]['content'] and "thinking" in chat_history[-1]['content'].lower():
477
+ return user_message, chat_history
478
 
479
  if not has_text and has_image:
480
  user_message_to_add = "Analyzing Staged Media."
481
  else:
482
+ user_message_to_add = user_message.strip()
 
 
 
483
 
484
+ chat_history.append({"role": "user", "content": user_message_to_add})
485
+ # do NOT append assistant here — chat_generator will append assistant entry when it starts
486
  return "", chat_history
487
 
488
  def stage_file_upload(file_path: str) -> Tuple[Any, str, gr.update, gr.update]:
 
502
  if item['role'] == 'user' and item['content']:
503
  last_user_prompt = item['content'].split("**User Query:**")[-1].strip().replace("[IMAGE RECEIVED]", "").strip()
504
  break
505
+ if not last_user_prompt:
506
+ return history, "Error: Could not find query.", gr.update(visible=False)
507
  web_results = web_search_tool(last_user_prompt)
508
  new_history = list(history)
509
  new_history[-1]['content'] += web_results
 
519
  # --- GRADIO INTERFACE ---
520
 
521
  with gr.Blocks(theme=gr.themes.Soft(), title="Luna Coding Partner") as demo:
522
+
523
  # --- State Components ---
524
  stop_signal = gr.State(value=False)
525
+ is_voice_chat = gr.State(value=False)
526
+ staged_image = gr.State(value=None)
527
  menu_visible_state = gr.State(value=False)
528
+
529
  gr.HTML("<h1 style='text-align: center; color: #4B0082;'>πŸŒ™ Luna Chat Space</h1>")
530
 
531
+ hint_box = gr.Textbox(value="Ask anything", lines=1, show_label=False, interactive=False, placeholder="Luna's Action...", visible=True)
532
+ file_download_output = gr.File(label="Generated File", visible=False)
533
 
534
  with gr.Row(visible=False) as fact_check_btn_row:
535
  gr.Column(min_width=1); btn_fact_check = gr.Button("Fact Check 🔎"); gr.Column(min_width=1)
536
 
537
+ chatbot = gr.Chatbot(label="Luna", height=500, type='messages')
538
+
539
  with gr.Row(visible=False) as webcam_capture_row:
540
  webcam_capture_component = gr.Image(sources=["webcam"], type="numpy", show_label=False)
541
  close_webcam_btn = gr.Button("✅ Use this image")
542
+
543
  with gr.Row(visible=False) as audio_record_row:
544
  audio_input = gr.Audio(sources=["microphone"], type="filepath", show_label=False)
545
+
546
  with gr.Column(visible=False, elem_id="menu_options_row") as menu_options_row:
547
+ file_input = gr.File(type="filepath", label="File Uploader", interactive=False)
548
  btn_take_photo = gr.Button("📸 Google Lens (Take Photo)")
549
  btn_add_files = gr.Button("📎 Upload File")
550
 
 
553
  txt = gr.Textbox(placeholder="Ask anything", show_label=False, lines=1, autofocus=True)
554
  mic_btn = gr.Button("🎙️", interactive=True, size="sm")
555
  combined_btn = gr.Button("✈️", variant="primary", size="sm")
556
+
557
+ audio_output = gr.Audio(visible=False)
558
 
559
  output_components = [chatbot, stop_signal, hint_box, txt, combined_btn, audio_output, is_voice_chat, fact_check_btn_row, staged_image, file_input, file_download_output]
560
 
561
  # --- WIRE EVENTS ---
 
562
  btn_menu.click(
563
  fn=toggle_menu, inputs=[menu_visible_state], outputs=[menu_visible_state, menu_options_row, fact_check_btn_row, btn_menu], queue=False
564
  )
565
+
566
  def prepare_file_upload(): return gr.update(visible=False), gr.update(value="➕"), gr.update(visible=False), gr.update(interactive=True), gr.update(value="")
567
  btn_add_files.click(fn=prepare_file_upload, inputs=[], outputs=[menu_options_row, btn_menu, fact_check_btn_row, file_input, txt], queue=False)
568
+
569
  file_input.change(
570
  fn=stage_file_upload, inputs=[file_input], outputs=[staged_image, hint_box, txt, file_input], queue=False
571
  )
 
574
  fn=lambda: (gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "📸 Camera Active. Capture an image.", gr.update(value="➕")),
575
  inputs=[], outputs=[menu_options_row, webcam_capture_row, input_row, hint_box, btn_menu], queue=False
576
  )
577
+
578
  close_webcam_btn.click(
579
  fn=lambda img: (gr.update(visible=True), gr.update(visible=False), img, f"📸 Photo staged: Click send (✈️).", gr.update(value="")),
580
  inputs=[webcam_capture_component], outputs=[input_row, webcam_capture_row, staged_image, hint_box, txt], queue=False
581
  )
582
+
583
  mic_btn.click(
584
  fn=lambda: (gr.update(visible=False), gr.update(visible=True), "🎙️ Recording..."),
585
  inputs=[], outputs=[input_row, audio_record_row, hint_box], queue=False
 
599
  )
600
 
601
  generator_inputs = [txt, staged_image, chatbot, stop_signal, is_voice_chat]
602
+
603
  # Text submit (Enter key)
604
  txt.submit(
605
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
 
608
  ).then(
609
  fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
610
  )
611
+
612
  # Send button click
613
  combined_btn.click(
614
  fn=user_turn, inputs=[txt, chatbot, staged_image], outputs=[txt, chatbot], queue=False
 
617
  ).then(
618
  fn=clear_staged_media, inputs=[], outputs=[staged_image], queue=False
619
  )
620
+
621
  btn_fact_check.click(
622
  fn=manual_fact_check, inputs=[chatbot], outputs=[chatbot, hint_box, fact_check_btn_row], queue=True
623
  )