import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor once at startup so each request
# only pays for inference, not weight loading.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")


def CLIP_calculate_score(image, text):
    """Score an image against semicolon-separated text prompts with CLIP.

    Args:
        image: Input image as delivered by the Gradio "image" component
            (PIL image / numpy array — TODO confirm which the component emits).
        text: Candidate captions separated by ";"; blank entries are ignored.

    Returns:
        dict mapping each prompt to its softmax probability (values sum
        to 1), the shape Gradio's "label" output expects. Empty dict when
        no non-blank prompts are given.
    """
    labels = [part.strip() for part in text.split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    # Inference only — no_grad skips building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    # Fix: raw logits divided by 100 are not probabilities. Softmax over
    # the text axis (the canonical CLIP usage) yields a proper
    # distribution over the candidate prompts.
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    # float(...) converts tensor scalars to plain Python floats so the
    # result serializes cleanly.
    return {label: float(p) for label, p in zip(labels, probs)}


examples = [
    [
        "images/girl_and_dog.jpg",
        "a dog playing in the beach; a dog and a girl playing in the beach; a girl playing in the beach",
    ],
    [
        "images/horse.jpg",
        "group of horses running; a dog playing; a horse standing",
    ],
    [
        "images/man_and_cat.jpg",
        "a man and a cat listening to music; a cat; a man",
    ],
]

demo = gr.Interface(
    fn=CLIP_calculate_score,
    inputs=["image", "text"],
    outputs="label",
    examples=examples,
)

demo.launch()