Spaces:
Runtime error
Runtime error
import gradio as gr | |
from transformers import BlipForConditionalGeneration, BlipProcessor | |
import torch | |
import tempfile | |
from gtts import gTTS | |
# Model setup: the BLIP processor and captioning model are created once at
# import time and shared by every call to the captioning function.
device = "cpu"
_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(_CHECKPOINT)
model_image_captioning = BlipForConditionalGeneration.from_pretrained(_CHECKPOINT)
model_image_captioning = model_image_captioning.to(device)
def generate_caption_tts(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The image to describe, in any form accepted by the
            processor's ``images`` argument.

    Returns:
        A ``(caption, audio_path)`` tuple: the decoded caption text and the
        path of a temporary ``.mp3`` file holding the spoken caption.
    """
    inputs = processor(images=image, return_tensors="pt")
    # Keep the input tensors on whatever device the model lives on.
    inputs = inputs.to(model_image_captioning.device)

    # Generation settings are passed as explicit keyword arguments instead of
    # being injected into the processor output, which should only carry
    # model inputs.
    outputs = model_image_captioning.generate(
        **inputs,
        max_length=20,
        num_beams=5,
    )
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    speech = gTTS(caption, lang="en")
    # mkstemp() hands back an open OS-level descriptor; discarding it leaked
    # one descriptor per request. NamedTemporaryFile closes its handle when
    # the ``with`` block exits, and delete=False keeps the file on disk so
    # the caller can still read it. The suffix lets consumers recognize the
    # audio format from the file name.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        audio_path = handle.name
    speech.save(audio_path)
    return (caption, audio_path)
# Text shown at the top of the web page.
title = "Alec图像理解器"
description = "Bootstrapping Language-Image Pre-training model演示:引导语言图像预训练以实现统一视觉语言理解和生成。 请上传您的图像"

# Web UI: one image input, with the caption text and its spoken audio as
# outputs.
iface = gr.Interface(
    fn=generate_caption_tts,
    title=title,
    description=description,
    # The legacy ``gr.inputs`` namespace and the ``shape`` argument are gone
    # in current Gradio releases; the top-level component is used instead.
    # No pre-cropping is requested here: the BLIP processor resizes the
    # image itself.
    inputs=gr.Image(type="pil"),
    outputs=["text", "audio"],
)

# For local debugging, launch(share=True, debug=True) can be used instead.
iface.launch()