import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import torch

# Load the pre-trained model, image processor, and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define generation parameters
max_length = 16
num_beams = 4

# Function to generate a caption from an image
def generate_caption(image):
    if image is None:
        return "Please upload an image."
    # Convert the image to RGB if it is not already
    if image.mode != "RGB":
        image = image.convert(mode="RGB")
    # Preprocess the image into pixel values and move them to the model's device
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    # Generate the caption with beam search and decode it to text
    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="🖼️ AI Image Caption Generator",
    description="Upload an image, and the AI will generate a descriptive caption for it.",
    allow_flagging="never"
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
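
For a quick sanity check outside the web UI, generate_caption can also be called directly on a PIL image, for example from a Python shell after the model has loaded. This is a minimal sketch; "sample.jpg" is a placeholder path, not a file shipped with this Space.

from PIL import Image

test_image = Image.open("sample.jpg")  # placeholder: any local image file
print(generate_caption(test_image))    # prints the generated caption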