import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Load the pretrained CLIP model and its processor (weights are downloaded on first use)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def generate_caption(image):
    if image is None:
        return "No image uploaded."
    # Candidate text prompts
    texts = [
        "a photo of a cat",
        "a photo of a dog",
        "a photo of a man",
        "a photo of a woman",
        "a photo of a laptop",
        "a photo of a smartphone",
        "a photo of a city",
        "a photo of a landscape",
        "a photo of food",
        "a photo of a car",
    ]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # convert to probabilities
    best_match = torch.argmax(probs).item()
    caption = texts[best_match]
    return f"Best match: {caption} (Confidence: {probs[0][best_match].item():.2f})"
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning with CLIP",
    description=(
        "Upload an image and get a dynamically generated caption using CLIP. "
        "Detectable categories: cat, dog, man, woman, laptop, smartphone, "
        "city, landscape, food, car."
    ),
)
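
# Launch the local Gradio server when the script is run directly
# (pass share=True to launch() if a temporary public URL is needed).
if __name__ == "__main__":
    iface.launch()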