import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Load the pretrained CLIP model and its processor (weights are downloaded on first use)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def generate_caption(image):
    if image is None:
        return "No image uploaded."
    # Candidate text prompts
    texts = [
        "a photo of a cat",
        "a photo of a dog",
        "a photo of a man",
        "a photo of a woman",
        "a photo of a laptop",
        "a photo of a smartphone",
        "a photo of a city",
        "a photo of a landscape",
        "a photo of food",
        "a photo of a car",
    ]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # convert to probabilities
    best_match = torch.argmax(probs).item()
    caption = texts[best_match]
    return f"Best match: {caption} (Confidence: {probs[0][best_match].item():.2f})"
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning with CLIP",
    description=(
        "Upload an image and get a dynamically generated caption using CLIP. "
        "Detectable categories: cat, dog, man, woman, laptop, smartphone, "
        "city, landscape, food, car."
    ),
)
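
# Launch the local Gradio server when the script is run directly
# (pass share=True to launch() if a temporary public URL is needed).
if __name__ == "__main__":
    iface.launch()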