Files changed (1)
app.py: +64 −15
app.py CHANGED
@@ -1,29 +1,78 @@
1
  import gradio as gr
2
- from project_model import process_inputs
 
3
 
 
 
4
 
5
def handle_inputs(image, audio):
    """Run the visual-Q&A pipeline on one image/audio pair.

    Returns a 3-tuple matching the UI outputs: a status/answer string,
    the image to display back, and the synthesized answer audio.
    """
    # Both inputs are required; bail out early with a user-facing message.
    missing = image is None or audio is None
    if missing:
        return "Please upload both an image and an audio clip.", None, None

    # Delegate the actual model work to the project pipeline.
    status_message, reply_audio = process_inputs(image, audio)
    return status_message, image, reply_audio
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
# ---- UI layout (old revision) -------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output")

    with gr.Row():
        # Left column: user-provided inputs.
        with gr.Column():
            image_input = gr.Image(
                label="Upload or Capture Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            audio_input = gr.Audio(
                label="Record Voice",
                sources=["microphone"],
                type="filepath",
            )
            submit_btn = gr.Button("Submit")

        # Right column: pipeline outputs.
        with gr.Column():
            status_output = gr.Textbox(label="Status", interactive=False)
            image_display = gr.Image(label="Processed Image")
            audio_output = gr.Audio(label="Answer Audio", interactive=False)

    # Output order must match handle_inputs' 3-tuple return.
    submit_btn.click(
        fn=handle_inputs,
        inputs=[image_input, audio_input],
        outputs=[status_output, image_display, audio_output],
    )


# Script entry point: start the Gradio server with public sharing enabled.
if __name__ == "__main__":
    demo.launch(show_error=True, share=True)
 
1
  import gradio as gr
2
+ from PIL import Image
3
+ from project_module import process_inputs, VisualQAState
4
 
5
# One VisualQAState instance shared by every request handled by this process.
session = VisualQAState()

# Module-level conversation state: the image currently under discussion and
# the accumulated Q&A transcript rendered into the chat box.
# NOTE(review): module globals are shared across ALL concurrent users of a
# Gradio app — per-user isolation would require gr.State. Confirm this app
# is intended for single-user use.
current_image = None
chat_history = []
 
11
 
12
# Unified handler for new questions or new images.
def handle_inputs(new_image, audio, followup_text, tts_enabled):
    """Handle one submit: either start a new image session or ask a follow-up.

    Args:
        new_image: freshly uploaded/captured image, or None to continue the
            current session.
        audio: filepath of an optional recorded voice question, or None.
        followup_text: typed question text (may be empty/whitespace).
        tts_enabled: whether to return synthesized answer audio.

    Returns:
        4-tuple matching the UI outputs: (answer text, image to display,
        answer audio or None, markdown-formatted chat history).
    """
    global current_image, chat_history

    # Normalize the typed text once. Fix: the previous code used
    # `followup_text.strip() if followup_text else default`, so a
    # whitespace-only string bypassed the default and produced an
    # empty question; `or` after strip() handles both cases.
    question = (followup_text or "").strip()

    if new_image is not None:
        # New image: reset the conversation for it.
        current_image = new_image
        chat_history.clear()
        question = question or "Describe the image"
        response, answer_audio = process_inputs(
            session,
            image=current_image,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    elif current_image is not None and (question or audio):
        # Follow-up question (typed and/or spoken) about the current image.
        response, answer_audio = process_inputs(
            session,
            image=None,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    else:
        # Nothing usable was provided.
        return "Please upload an image and ask a question.", None, None, ""

    # Append this turn to the transcript and render it as markdown.
    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
    history_text = "\n\n".join(chat_history)

    return response, current_image, answer_audio if tts_enabled else None, history_text
40
+
41
# ---- UI layout ----------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            image_input = gr.Image(label="Upload or Capture New Image", sources=["upload", "webcam"], type="pil")
            audio_input = gr.Audio(label="Record Voice (Optional)", sources=["microphone"], type="filepath")
            followup_text = gr.Textbox(label="Type a Question or Follow-up", placeholder="e.g., What’s going on?")
            tts_toggle = gr.Checkbox(label="Enable Audio Response", value=True)
            submit_btn = gr.Button("Ask")

        # Right column: answer text, the image echoed back, optional answer
        # audio, and the running Q&A transcript.
        with gr.Column():
            status_output = gr.Textbox(label="Answer", interactive=False)
            image_display = gr.Image(label="Current Image")
            audio_output = gr.Audio(label="Answer Audio", interactive=False)
            chat_box = gr.Markdown(label="Chat History")

    # Wire the button to the handler; the outputs list must match the
    # 4-tuple returned by handle_inputs, in order.
    submit_btn.click(
        fn=handle_inputs,
        inputs=[image_input, audio_input, followup_text, tts_toggle],
        outputs=[status_output, image_display, audio_output, chat_box],
    )
75
 
76
# Script entry point: start the Gradio server. share=True exposes a public
# tunnel URL; show_error=True surfaces handler exceptions in the UI.
if __name__ == "__main__":
    demo.launch(show_error=True, share=True)