import gradio as gr
from dotenv import load_dotenv

# Load environment variables before importing modules that may read them at import time
load_dotenv()

from huggingface_hub import InferenceClient
from model import generate_answer
from stt import transcribe_audio
from tts import text_to_speech


def ask_image(image, audio_file=None):
    """Process an image and audio question, returning text and audio response."""
    # Handle audio input
    if audio_file is not None:
        question = transcribe_audio(audio_file)
    else:
        question = "What do you see in this image?"

    # Process image with VLM
    response = generate_answer(image, question)

    # Generate audio from response
    audio_path = text_to_speech(response)

    return question, response, audio_path


# Create the Ask-the-Image interface
with gr.Blocks() as image_interface:
    gr.Markdown("# Ask the Image")
    gr.Markdown(
        "Upload an image and record your question, or just upload an image to see what's in it."
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Record Question (Max 10 seconds)",
                max_length=10,
            )
            submit_btn = gr.Button("Ask")
        with gr.Column():
            question_output = gr.Textbox(label="Transcribed Question")
            answer_output = gr.Textbox(label="Answer")
            audio_output = gr.Audio(label="Spoken Answer")

    # Wire the button to the handler
    submit_btn.click(
        fn=ask_image,
        inputs=[image_input, audio_input],
        outputs=[question_output, answer_output, audio_output],
    )

# Create a tabbed interface
demo = gr.TabbedInterface(
    [image_interface],
    ["Ask the Image"],
)

if __name__ == "__main__":
    demo.launch()
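
# --- Reference sketch only ---------------------------------------------------
# The app above imports generate_answer (model.py), transcribe_audio (stt.py),
# and text_to_speech (tts.py), but those modules are not shown in this section.
# The sketch below is a minimal guess at their shape, assuming they wrap
# huggingface_hub's InferenceClient (which app.py already imports) and that an
# HF_TOKEN variable is provided via .env; the project's actual implementations
# and model choices may differ.

import io
import os
import tempfile

from huggingface_hub import InferenceClient

_client = InferenceClient(token=os.getenv("HF_TOKEN"))  # assumed env var name


def transcribe_audio(audio_path: str) -> str:
    """Turn a recorded question (filepath from gr.Audio) into text."""
    result = _client.automatic_speech_recognition(audio_path)
    return result.text


def generate_answer(image, question: str) -> str:
    """Answer a question about a PIL image with a visual question answering model."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")  # InferenceClient expects bytes, a path, or a URL
    answers = _client.visual_question_answering(image=buffer.getvalue(), question=question)
    return answers[0].answer  # highest-scoring answer


def text_to_speech(text: str) -> str:
    """Synthesize speech and return a filepath that gr.Audio can play."""
    audio_bytes = _client.text_to_speech(text)
    # Output format depends on the TTS model served behind the Inference API
    with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as f:
        f.write(audio_bytes)
        return f.name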