import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
import spaces
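# `spaces` is the Hugging Face Spaces SDK; its @spaces.GPU decorator (used below)
# requests a GPU for the duration of each scoring call when running on ZeroGPU hardware.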
# Dictionary of available models with their image sizes
MODELS = {
"CLIP ViT-B/32": ("openai/clip-vit-base-patch32", 224, "clip"),
"CLIP ViT-B/16": ("openai/clip-vit-base-patch16", 224, "clip"),
"CLIP ViT-L/14": ("openai/clip-vit-large-patch14", 224, "clip"),
"CLIP ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336, "clip"),
"SigLIP Large/16-256": ("google/siglip-large-patch16-256", 256, "siglip"),
"SigLIP Base/16-384": ("google/siglip-base-patch16-384", 384, "siglip"),
"SigLIP Large/16-384": ("google/siglip-large-patch16-384", 384, "siglip"),
}
# Initialize models and processors
models = {}
processors = {}
for model_name, (model_path, _, model_type) in MODELS.items():
if model_type == "clip":
models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
processors[model_name] = CLIPProcessor.from_pretrained(model_path)
elif model_type == "siglip":
models[model_name] = AutoModel.from_pretrained(model_path).to("cuda")
processors[model_name] = AutoProcessor.from_pretrained(model_path)
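# Note: every model variant is loaded onto the GPU at startup; with the larger
# checkpoints this trades extra memory for avoiding per-request loading latency.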
@spaces.GPU
def calculate_score(image, text, model_name):
labels = text.split(";")
labels = [l.strip() for l in labels]
labels = list(filter(None, labels))
if len(labels) == 0:
return dict()
model = models[model_name]
processor = processors[model_name]
model_type = MODELS[model_name][2]
# Preprocess the image and text
inputs = processor(text=labels, images=[image], return_tensors="pt", padding="max_length")
inputs = {k: v.to("cuda") for k, v in inputs.items()}
# Calculate embeddings
with torch.no_grad():
outputs = model(**inputs)
if model_type == "clip":
image_embeds = outputs.image_embeds
text_embeds = outputs.text_embeds
elif model_type == "siglip":
image_embeds = outputs.image_embeds
text_embeds = outputs.text_embeds
# Normalize embeddings
image_embeds = F.normalize(image_embeds, p=2, dim=1)
text_embeds = F.normalize(text_embeds, p=2, dim=1)
# Calculate similarity
if model_type == "clip":
# For CLIP, use cosine similarity
similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
similarities = torch.clamp(similarities, min=0, max=1)
elif model_type == "siglip":
# For SigLIP, use sigmoid on dot product
logits = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
similarities = torch.sigmoid(logits)
# Convert to numpy array
similarities = similarities.cpu().numpy()
results_dict = {label: float(score) for label, score in zip(labels, similarities)}
return results_dict
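# Gradio UI: an image and a semicolon-separated list of descriptions go in,
# and a Label component shows the per-description similarity scores.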
with gr.Blocks() as demo:
gr.Markdown("# Multi-Model CLIP and SigLIP Score")
gr.Markdown(
"Calculate the score (cosine similarity) between the given image and text descriptions using different CLIP and SigLIP model variants"
)
with gr.Row():
image_input = gr.Image(type="pil")
output_label = gr.Label()
with gr.Row():
text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
model_dropdown = gr.Dropdown(
choices=list(MODELS.keys()), label="Model", value="CLIP ViT-B/16"
)
def process_inputs(image, text, model_name):
if image is None or text.strip() == "":
return None
return calculate_score(image, text, model_name)
inputs = [image_input, text_input, model_dropdown]
outputs = output_label
image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)
gr.Examples(
examples=[
[
"cat.jpg",
"a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
"CLIP ViT-B/16",
]
],
fn=process_inputs,
inputs=inputs,
outputs=outputs,
)
demo.launch()
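# Programmatic usage sketch (assumes a local image file named "cat.jpg" and that the
# models above have finished loading):
#   from PIL import Image
#   scores = calculate_score(Image.open("cat.jpg"), "a cat sitting; a dog", "CLIP ViT-B/16")
#   print(scores)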