"""Gradio demo: score an image against semicolon-separated captions with CLIP."""

import gradio as gr
import torch
from transformers import CLIPModel, CLIPProcessor

MODEL_NAME = "openai/clip-vit-base-patch16"

# Loaded once at import time so every request reuses the same weights.
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)


def calculate_CLIP_score(image, text):
    """Return a CLIP score for each caption in *text* against *image*.

    Args:
        image: The image supplied by the Gradio ``"image"`` input, or
            ``None`` when the user submitted without one.
        text: Candidate captions separated by ``;``. Surrounding whitespace
            is stripped and empty entries are ignored.

    Returns:
        A dict mapping each distinct caption to its image-text logit divided
        by 100. The dict is empty when there is no image or no usable
        caption.
    """
    if image is None or not text:
        return {}

    stripped = (part.strip() for part in text.split(";"))
    # dict.fromkeys drops duplicate captions while keeping their order; they
    # would collapse to a single key in the result anyway, so there is no
    # point in encoding them twice.
    labels = list(dict.fromkeys(label for label in stripped if label))
    if not labels:
        return {}

    inputs = processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True,
        # Cut over-long captions to the tokenizer's maximum length instead of
        # letting the text encoder fail on them.
        truncation=True,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # One image was passed, so row 0 holds the logit for every caption.
    # .tolist() yields plain Python floats for the Label output.
    scores = outputs.logits_per_image[0].tolist()

    # NOTE(review): dividing by 100 presumably undoes CLIP's logit scale so
    # the values read as similarities -- confirm against the checkpoint.
    return {label: score / 100.0 for label, score in zip(labels, scores)}


examples = [
    [
        "images/two_dogs.jpg",
        "two dogs playing in the beach; a dog and a dog playing in the beach; beach",
    ],
    [
        "images/horse_field.jpg",
        "horse standing in a field; a field; a horse standing",
    ],
    [
        "images/human.jpg",
        "a man beside a river; a riverbed; a man",
    ],
]

demo = gr.Interface(
    fn=calculate_CLIP_score,
    inputs=["image", "text"],
    outputs="label",
    examples=examples,
)

demo.launch()