"""Gradio app: generate a descriptive caption for an uploaded image.

Uses the ``nlpconnect/vit-gpt2-image-captioning`` ViT-encoder / GPT-2-decoder
model from HuggingFace. The model, processor, and tokenizer are loaded once at
module import time; ``generate_caption`` runs a single beam-search inference
per request.
"""

import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import torch

# Load the pre-trained model, processor, and tokenizer (downloads on first run).
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Set the device to GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Inference-only app: switch off dropout/batch-norm training behavior.
model.eval()

# Generation parameters: cap caption length and use beam search for quality.
max_length = 16
num_beams = 4


def generate_caption(image):
    """Return a beam-search caption for *image*.

    Args:
        image: A ``PIL.Image.Image`` from the Gradio image input, or ``None``
            when the user submits without uploading.

    Returns:
        The decoded caption string (stripped), or a prompt message when no
        image was provided.
    """
    if image is None:
        return "Please upload an image."

    # The ViT processor expects 3-channel RGB input; convert palettized,
    # grayscale, or RGBA uploads.
    if image.mode != "RGB":
        image = image.convert(mode="RGB")

    # Preprocess the image into a pixel-value tensor on the model's device.
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Inference only — disable autograd to avoid building gradient state.
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return caption.strip()


# Create the Gradio interface.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="🖼️ AI Image Caption Generator",
    description="Upload an image, and the AI will generate a descriptive caption for it.",
    allow_flagging="never",
)

# Launch the app.
if __name__ == "__main__":
    iface.launch()