import gradio as gr
from transformers import AutoProcessor, BlipForQuestionAnswering, pipeline
from transformers.utils import logging

# Silence non-error warnings from transformers.
logging.set_verbosity_error()

# Text-to-speech pipeline used to narrate the model's answer.
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# BLIP visual question answering model and its matching processor.
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")


def get_pipeline_prediction(pil_image, question):
    # Encode the image/question pair and generate an answer with BLIP.
    inputs = processor(pil_image, question, return_tensors="pt")
    out = model.generate(**inputs)
    text = processor.decode(out[0], skip_special_tokens=True)

    # Narrate the answer; the TTS pipeline returns a dict with a
    # (1, num_samples) "audio" array and its "sampling_rate".
    narration = tts_pipe(text)

    # Gradio's numpy audio format is a (sampling_rate, waveform) tuple.
    return narration["sampling_rate"], narration["audio"][0]


demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=[
        gr.Image(label="Input image", type="pil"),
        gr.Textbox(label="Ask your question"),
    ],
    outputs=gr.Audio(label="Narration", type="numpy", autoplay=True),
)

demo.launch(server_name="0.0.0.0", server_port=7860)
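
# When experimenting in a notebook, the port stays bound after launch;
# Gradio interfaces can be shut down explicitly to release it:
# demo.close()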