"""Gradio demo: caption an uploaded image with BLIP and read the caption aloud.

The script loads the ``Salesforce/blip-image-captioning-large`` checkpoint,
exposes a single Gradio interface that takes an image and returns the
generated caption together with an audio file of that caption synthesized
by gTTS, and launches the interface when the module is executed.
"""

import tempfile

import gradio as gr
import torch
from gtts import gTTS
from transformers import BlipForConditionalGeneration, BlipProcessor

# Load models
device = "cpu"
MODEL_NAME = "Salesforce/blip-image-captioning-large"

# Generation settings passed to ``generate`` for every request.
MAX_CAPTION_LENGTH = 20
NUM_BEAMS = 5

processor = BlipProcessor.from_pretrained(MODEL_NAME)
model_image_captioning = BlipForConditionalGeneration.from_pretrained(
    MODEL_NAME
).to(device)


def generate_caption_tts(image):
    """Caption *image* and synthesize the caption as speech.

    Args:
        image: The image handed over by the Gradio image input; it is passed
            unchanged to the BLIP processor.

    Returns:
        A ``(caption, audio_path)`` tuple. ``caption`` is the decoded text of
        the first generated sequence. ``audio_path`` is the path of a
        temporary ``.mp3`` file containing the spoken caption, or ``None``
        when the caption is empty and there is nothing to speak.

    Raises:
        ValueError: If no image was supplied.
    """
    if image is None:
        # Submitting the form without an upload yields None; fail with a
        # clear message instead of an opaque error from inside the processor.
        raise ValueError("No image provided; please upload an image first.")

    # Keep the input tensors on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Pass generation options as explicit keyword arguments rather than
    # mutating the processor output.
    outputs = model_image_captioning.generate(
        **inputs,
        max_length=MAX_CAPTION_LENGTH,
        num_beams=NUM_BEAMS,
    )
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    if not caption.strip():
        # NOTE(review): gTTS is expected to reject empty text; skip synthesis
        # so the caller still receives the (empty) caption.
        return (caption, None)

    speech = gTTS(caption, lang="en")

    # Reserve a uniquely named file and close our handle right away so no
    # file descriptor is leaked; gTTS reopens the path itself. The file is
    # deliberately kept (delete=False) because the caller receives its path.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_handle:
        tmp_file = tmp_handle.name
    speech.save(tmp_file)

    return (caption, tmp_file)


title = "Alec图像理解器"
description = "Bootstrapping Language-Image Pre-training model演示:引导语言图像预训练以实现统一视觉语言理解和生成。 请上传您的图像"

iface = gr.Interface(
    fn=generate_caption_tts,
    title=title,
    description=description,
    # NOTE(review): the ``gr.inputs`` namespace is a legacy Gradio API and is
    # believed to be absent from newer releases; confirm the pinned Gradio
    # version before upgrading.
    inputs=gr.inputs.Image(shape=(224, 224)),
    outputs=["text", "audio"],
)

# For a public share link with verbose errors while debugging, call
# ``iface.launch(share=True, debug=True)`` instead.
iface.launch()