"""Gradio demo that scores an image against free-text labels using CLIP."""

import gradio as gr
import torch
from transformers import CLIPModel, CLIPProcessor

# Single source of truth for the checkpoint so model and processor always match.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


def calculate_score(image, text):
    """Score ``image`` against each ``;``-separated label in ``text``.

    Args:
        image: The image as delivered by the Gradio ``"image"`` input. May be
            ``None`` when the user submits without providing an image.
        text: One or more candidate labels separated by ``;``. Whitespace
            around each label is stripped and empty labels are dropped.

    Returns:
        A dict mapping each label to ``logits_per_image / 100`` as a plain
        Python ``float``. An empty dict is returned when there is no image or
        no non-empty label. Duplicate labels collapse into a single key.
    """
    labels = [label.strip() for label in text.split(";")]
    labels = [label for label in labels if label]
    # Nothing to compare: skip the model call instead of letting the
    # processor fail on a missing image or an empty label list.
    if image is None or not labels:
        return {}

    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    # Inference only: avoid building an autograd graph for every request.
    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): dividing by 100 presumably undoes CLIP's learned logit
    # scale so the value approximates a cosine similarity -- confirm against
    # the checkpoint's logit_scale.
    # The division is done in float32 (as before) and then converted to
    # built-in floats so the result does not carry NumPy scalar types.
    scores = (outputs.logits_per_image.numpy()[0] / 100.0).tolist()
    return dict(zip(labels, scores))


if __name__ == "__main__":
    cat_example = [
        "cat.jpg",
        "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
    ]

    demo = gr.Interface(
        fn=calculate_score,
        inputs=["image", "text"],
        outputs="label",
        examples=[cat_example],
        allow_flagging="never",
        description="# CLIP Score",
        article="Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text",
        cache_examples=True,
    )

    demo.launch()