from transformers import (
    Blip2VisionConfig,
    Blip2QFormerConfig,
    OPTConfig,
    Blip2Config,
    Blip2ForConditionalGeneration,
    Blip2VisionModel,
    Blip2Processor,
    AutoProcessor,
)
from PIL import Image
import requests
import torch
import gradio as gr

# Build a BLIP-2 model from a default (randomly initialized) config,
# just to show how the config and model classes relate.
config = Blip2Config()
model = Blip2ForConditionalGeneration(config)
config = model.config

# The same pattern works for the vision tower on its own.
vis_config = Blip2VisionConfig()
model = Blip2VisionModel(vis_config)
config_2 = model.config

# Load pretrained weights for captioning.
# Note: 'Salesforce/blip-image-captioning-large' is a BLIP (not BLIP-2) checkpoint
# and cannot be loaded into the Blip2 classes, so a BLIP-2 checkpoint is used here.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")


def captioning(image):
    # Preprocess the image, generate caption token ids, and decode them to text.
    inputs = processor(images=image, return_tensors="pt")
    generated_ids = model.generate(**inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return image, generated_text


# Simple Gradio demo: upload an image, get the image back alongside its caption.
demo = gr.Interface(
    captioning,
    inputs=gr.Image(type="pil"),
    outputs=["image", "text"],
)
demo.launch()