File size: 1,518 Bytes
aaefaa5 6c77aaa 224bafa 6c77aaa 3e9afa5 6c77aaa 224bafa 6c77aaa 9a86c7a 6c77aaa e3b65d8 9578b89 e3b65d8 c38662a a3b147e 1b97aa8 d3c8f24 a3b147e 224bafa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import gradio as gr
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
# Hugging Face Hub repository id used for both the processor and the model.
_FUSECAP_REPO = "noamrot/FuseCap_Image_Captioning"

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both objects are created once, when the module is executed, and are then
# shared by every call to the captioning function.
processor = BlipProcessor.from_pretrained(_FUSECAP_REPO)
model = BlipForConditionalGeneration.from_pretrained(_FUSECAP_REPO)
model = model.to(device)
def inference(raw_image):
    """Generate a caption for one image with the loaded captioning model.

    Args:
        raw_image: Image to caption; it is handed to the processor unchanged.
            ``None`` means that no image was supplied.

    Returns:
        The decoded caption with special tokens skipped, or an empty string
        when ``raw_image`` is ``None``.
    """
    # Without an image there is nothing to caption; return early rather than
    # pushing None through the processor and the model.
    if raw_image is None:
        return ""

    # Prompt text given to the processor together with the image.
    text = "a picture of "
    inputs = processor(raw_image, text, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    # Only the first generated sequence is decoded.
    return processor.decode(out[0], skip_special_tokens=True)
# Input component: deliver the picture to the callback as a PIL image. The
# component is not marked non-interactive, so it stays usable for uploads.
inputs = [gr.Image(type="pil")]
# Output component: a text box labelled "Caption".
outputs = gr.Textbox(label="Caption")
description = "Gradio demo for FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. This demo features a BLIP-based model, trained using FuseCap."
# Relative paths; presumably these images ship next to this script -- confirm.
examples = [["surfer.jpg"], ["bike.jpg"]]
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2305.17718' target='_blank'>FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions</a>"

# Pass the component objects themselves (not the "image"/"text" shortcuts) so
# that the settings configured above are the ones the interface uses.
iface = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title="FuseCap",
    description=description,
    article=article,
    examples=examples,
)
iface.queue()
iface.launch()
|