"""Gradio demo that captions an uploaded image with a GIT causal language model.

The processor is loaded from "microsoft/git-base" and the model weights from
"nkasmanoff/git-planet"; both are loaded once at import time.
"""

import re
from typing import Optional

import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Single source of truth for the device used by both the model and the inputs.
device = 'cpu'

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("nkasmanoff/git-planet").to(device)


def predict(
    image: Optional[Image.Image],
    max_length: int = 64,
    device: str = device,
) -> str:
    """Generate a caption for ``image``.

    Args:
        image: The image to caption. The Gradio input below is configured
            with ``type='pil'`` and ``optional=True``, so this may be ``None``
            when the user submits without uploading anything.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        device: Device the preprocessed inputs are moved to. Defaults to the
            module-level ``device`` the model was moved to, so inputs and
            model stay on the same device.

    Returns:
        The first decoded caption (special tokens skipped), or an empty
        string when no image was supplied.
    """
    # The image input is optional; without this guard a missing upload would
    # be passed straight to the processor and raise.
    if image is None:
        return ""

    encoded = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        pixel_values=encoded.pixel_values,
        max_length=max_length,
    )
    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return captions[0]


# Named so as not to shadow the builtin ``input``.
image_input = gr.inputs.Image(label="Upload your Image", type='pil', optional=True)
caption_output = gr.outputs.Textbox(type="text", label="Captions")

title = "Image Captioning"

interface = gr.Interface(
    fn=predict,
    inputs=image_input,
    theme="grass",
    outputs=caption_output,
    title=title,
)
interface.launch(debug=True)