"""Gradio demo: score one image against semicolon-separated text labels with CLIP."""

import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces

# Check if CUDA is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")


@spaces.GPU
def calculate_score(image, text):
    """Score ``image`` against each description contained in ``text``.

    Args:
        image: The value delivered by the ``gr.Image`` component. It is
            ``None`` when no image is loaded (e.g. after the user clears it).
        text: Descriptions separated by semicolons. Surrounding whitespace is
            stripped and empty entries are ignored.

    Returns:
        A dict mapping each description to its image-text logit divided by
        100, as a plain Python float. The dict is empty when there is no image
        or no non-empty description. Repeated descriptions collapse to a
        single key.
    """
    # The image `change` event also fires when the image is cleared; without
    # this guard the model call fails for lack of pixel inputs. Use an
    # identity check: truth-testing a multi-element array raises.
    if image is None:
        return {}

    labels = [part.strip() for part in (text or "").split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    # Move tensors to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Row 0 belongs to the single input image. `.tolist()` yields plain Python
    # floats, so the result does not rely on NumPy-aware serialization.
    scores = outputs.logits_per_image[0].detach().cpu().tolist()

    # NOTE(review): dividing by 100 presumably undoes CLIP's logit scale
    # (about 100 for the OpenAI checkpoints) -- confirm against
    # `model.logit_scale` if exact cosine similarity is required.
    return {label: score / 100.0 for label, score in zip(labels, scores)}


with gr.Blocks() as demo:
    gr.Markdown("# CLIP Score")
    gr.Markdown(
        "Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text"
    )
    with gr.Row():
        image_input = gr.Image()
        output_label = gr.Label()
    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")

    # Recompute whenever the image changes or the text box is submitted.
    image_input.change(
        fn=calculate_score, inputs=[image_input, text_input], outputs=output_label
    )
    text_input.submit(
        fn=calculate_score, inputs=[image_input, text_input], outputs=output_label
    )

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
            ]
        ],
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label,
    )

demo.launch()