Files changed (1)
app.py: +64 −15
app.py CHANGED
@@ -1,29 +1,78 @@
1
  import gradio as gr
2
- from project_model import process_inputs
 
3
 
 
 
4
 
5
def handle_inputs(image, audio):
    """Run the visual-Q&A pipeline on one image/audio pair.

    Returns a 3-tuple matching the UI outputs: a status/answer string,
    the image to display back, and the synthesized answer audio.
    """
    # Both inputs are required; bail out early with a user-facing message.
    missing = image is None or audio is None
    if missing:
        return "Please upload both an image and an audio clip.", None, None

    # Delegate the actual model work to the project pipeline.
    status_message, reply_audio = process_inputs(image, audio)
    return status_message, image, reply_audio
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
# ---- UI layout (old revision) -------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output")

    with gr.Row():
        # Left column: user-provided inputs.
        with gr.Column():
            image_input = gr.Image(
                label="Upload or Capture Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            audio_input = gr.Audio(
                label="Record Voice",
                sources=["microphone"],
                type="filepath",
            )
            submit_btn = gr.Button("Submit")

        # Right column: pipeline outputs.
        with gr.Column():
            status_output = gr.Textbox(label="Status", interactive=False)
            image_display = gr.Image(label="Processed Image")
            audio_output = gr.Audio(label="Answer Audio", interactive=False)

    # Output order must match handle_inputs' 3-tuple return.
    submit_btn.click(
        fn=handle_inputs,
        inputs=[image_input, audio_input],
        outputs=[status_output, image_display, audio_output],
    )


# Script entry point: start the Gradio server with public sharing enabled.
if __name__ == "__main__":
    demo.launch(show_error=True, share=True)
 
1
  import gradio as gr
2
+ from PIL import Image
3
+ from project_module import process_inputs, VisualQAState
4
 
5
# One VisualQAState instance shared by every request handled by this process.
session = VisualQAState()

# Module-level conversation state: the image currently under discussion and
# the accumulated Q&A transcript rendered into the chat box.
# NOTE(review): module globals are shared across ALL concurrent users of a
# Gradio app — per-user isolation would require gr.State. Confirm this app
# is intended for single-user use.
current_image = None
chat_history = []
 
11
 
12
# Unified handler for new questions or new images.
def handle_inputs(new_image, audio, followup_text, tts_enabled):
    """Handle one submit: either start a new image session or ask a follow-up.

    Args:
        new_image: freshly uploaded/captured image, or None to continue the
            current session.
        audio: filepath of an optional recorded voice question, or None.
        followup_text: typed question text (may be empty/whitespace).
        tts_enabled: whether to return synthesized answer audio.

    Returns:
        4-tuple matching the UI outputs: (answer text, image to display,
        answer audio or None, markdown-formatted chat history).
    """
    global current_image, chat_history

    # Normalize the typed text once. Fix: the previous code used
    # `followup_text.strip() if followup_text else default`, so a
    # whitespace-only string bypassed the default and produced an
    # empty question; `or` after strip() handles both cases.
    question = (followup_text or "").strip()

    if new_image is not None:
        # New image: reset the conversation for it.
        current_image = new_image
        chat_history.clear()
        question = question or "Describe the image"
        response, answer_audio = process_inputs(
            session,
            image=current_image,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    elif current_image is not None and (question or audio):
        # Follow-up question (typed and/or spoken) about the current image.
        response, answer_audio = process_inputs(
            session,
            image=None,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    else:
        # Nothing usable was provided.
        return "Please upload an image and ask a question.", None, None, ""

    # Append this turn to the transcript and render it as markdown.
    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
    history_text = "\n\n".join(chat_history)

    return response, current_image, answer_audio if tts_enabled else None, history_text
40
+
41
# ---- UI layout ----------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            image_input = gr.Image(label="Upload or Capture New Image", sources=["upload", "webcam"], type="pil")
            audio_input = gr.Audio(label="Record Voice (Optional)", sources=["microphone"], type="filepath")
            followup_text = gr.Textbox(label="Type a Question or Follow-up", placeholder="e.g., What’s going on?")
            tts_toggle = gr.Checkbox(label="Enable Audio Response", value=True)
            submit_btn = gr.Button("Ask")

        # Right column: answer text, the image echoed back, optional answer
        # audio, and the running Q&A transcript.
        with gr.Column():
            status_output = gr.Textbox(label="Answer", interactive=False)
            image_display = gr.Image(label="Current Image")
            audio_output = gr.Audio(label="Answer Audio", interactive=False)
            chat_box = gr.Markdown(label="Chat History")

    # Wire the button to the handler; the outputs list must match the
    # 4-tuple returned by handle_inputs, in order.
    submit_btn.click(
        fn=handle_inputs,
        inputs=[image_input, audio_input, followup_text, tts_toggle],
        outputs=[status_output, image_display, audio_output, chat_box],
    )
75
 
76
# Script entry point: start the Gradio server. share=True exposes a public
# tunnel URL; show_error=True surfaces handler exceptions in the UI.
if __name__ == "__main__":
    demo.launch(show_error=True, share=True)