"""Gradio demo that captions an image with a ViT-GPT2 encoder-decoder model."""

from typing import Optional

import gradio as gr
from PIL import Image
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Single source of truth for the checkpoint shared by the model, the image
# processor and the tokenizer.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

# Default decoding settings used when the caller does not override them.
DEFAULT_MAX_LENGTH = 16
DEFAULT_NUM_BEAMS = 4

# Load model and processor
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Captioning function
def generate_caption(
    image: Optional[Image.Image],
    max_length: int = DEFAULT_MAX_LENGTH,
    num_beams: int = DEFAULT_NUM_BEAMS,
) -> str:
    """Generate a caption for *image*.

    Args:
        image: The picture to describe, or ``None`` when the user submitted
            nothing.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam-search width passed to ``model.generate``.

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped, or ``"No image provided."`` when *image* is
        ``None``.
    """
    # Nothing was uploaded, captured or pasted.
    if image is None:
        return "No image provided."

    # The UI is expected to hand over RGB images already, but direct callers
    # may pass RGBA / grayscale / palette images; normalise to three channels
    # before preprocessing.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # Preprocess
    pixel_values = feature_extractor(
        images=image, return_tensors="pt"
    ).pixel_values.to(device)

    # Generate
    output_ids = model.generate(
        pixel_values, max_length=max_length, num_beams=num_beams
    )
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    return caption


# Build Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Image Captioning with Gradio")

    # Image input and caption output side by side; the button sits below.
    with gr.Row():
        upload_input = gr.Image(
            sources=["upload", "webcam", "clipboard"],
            type="pil",
            label="Upload Image",
        )
        output_text = gr.Textbox(label="Caption", interactive=False)

    generate_btn = gr.Button("Generate Caption")
    generate_btn.click(
        fn=generate_caption,
        inputs=upload_input,
        outputs=output_text,
    )

demo.launch()