import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP weights and their matching preprocessor once, at import time.
# Both are loaded from the same checkpoint name.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


def calculate_CLIP_score(image, text):
    """Score how well each caption in *text* matches *image* using CLIP.

    Args:
        image: The image handed over by the Gradio ``"image"`` input, or
            ``None`` when no image was supplied.
        text: Candidate captions separated by ``;``. Whitespace around each
            caption is stripped and empty captions are dropped.

    Returns:
        A dict mapping each caption to its image-text logit divided by 100,
        as a plain Python ``float``. The dict is empty when there is no
        image, no text, or no non-empty caption.
    """
    # Nothing to compare: mirror the empty-caption behaviour instead of
    # letting the processor / ``str.split`` raise on a missing input.
    if image is None or not text:
        return {}

    words = [w.strip() for w in text.split(";")]
    words = [w for w in words if w]
    if not words:
        return {}

    inputs = processor(text=words, images=image, return_tensors="pt", padding=True)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single image is passed, so row 0 holds its logit against every
    # caption; ``tolist()`` yields plain Python floats.
    scores = outputs.logits_per_image[0].tolist()

    # NOTE(review): dividing by 100 presumably undoes CLIP's logit scale so
    # the values land near cosine-similarity range -- confirm against the
    # checkpoint's ``logit_scale``.
    return {label: score / 100.0 for label, score in zip(words, scores)}


# Pre-filled demo inputs: each entry is [image path, ";"-separated captions].
examples = [
    [
        "images/two_dogs.jpg",
        "two dogs playing in the beach; a dog and a dog playing in the beach; beach",
    ],
    [
        "images/horse_field.jpg",
        "horse standing in a field; a field; a horse standing",
    ],
    [
        "images/human.jpg",
        "a man beside a river; a riverbed; a man",
    ],
]

# Build the Gradio UI: an image input and a text input feed
# calculate_CLIP_score, and the returned {caption: score} dict is shown by a
# "label" output component. `examples` supplies the pre-filled inputs.
demo = gr.Interface(
    fn=calculate_CLIP_score,
    inputs=["image", "text"],
    outputs="label",
    examples=examples,
)

# Launch the app as soon as this module is executed.
demo.launch()