import gradio as gr
from transformers import pipeline

# Build both model pipelines once at import time so every request reuses the
# already-loaded weights instead of reloading them per call.
img_text_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def describe_image(image):
    """Caption *image* and narrate the caption as speech.

    Parameters
    ----------
    image : PIL.Image.Image
        Input picture. Gradio delivers a PIL image here because the
        Interface input is ``gr.Image(type="pil")`` (the old parameter
        name ``file_path`` was misleading).

    Returns
    -------
    tuple
        ``(sampling_rate, audio_samples)`` — the format expected by
        ``gr.Audio(type="numpy")``.
    """
    caption_result = img_text_pipe(image)
    description_text = caption_result[0]["generated_text"]
    narration = narrator(description_text)
    # The TTS pipeline returns audio with a leading batch axis; take the
    # first (only) clip so gr.Audio receives a 1-D waveform.
    return narration["sampling_rate"], narration["audio"][0]


iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=gr.Audio(label="Narration", type="numpy", autoplay=True),
)
iface.launch()