import torch
import clip
from PIL import Image
import gradio as gr

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-B/32" CLIP checkpoint together with its image preprocessing
# callable; both are module-level so they are loaded once at start-up.
model, preprocess = clip.load("ViT-B/32", device=device)

def hotornot(image, gender):
    """Score an image against positive/negative CLIP text prompts.

    Three prompt pairs ("hot"/"gross", "beautiful"/"ugly",
    "attractive"/"hideous") are built from ``gender``. For each pair the
    model's image-text logits are turned into a two-way softmax, and the
    difference between the positive and negative probability is rescaled
    from [-1, 1] to [0, 100].

    Args:
        image: Pixel data supporting ``.astype("uint8")`` that PIL can read
            as RGB (presumably the array supplied by the Gradio image
            input -- confirm against the interface definition).
        gender: Noun inserted into every prompt, e.g. ``"person"``.

    Returns:
        A tuple ``(composite, hotness_score, beauty_score,
        attractiveness_score)``, each rounded to two decimals. The composite
        is computed from the mean positive and mean negative probabilities.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        # Fail with a clear message rather than an AttributeError on `.astype`.
        raise ValueError("No image provided; please supply an image to score.")

    pil_image = Image.fromarray(image.astype("uint8"), "RGB")
    image_batch = preprocess(pil_image).unsqueeze(0).to(device)

    positive_terms = [f'a hot {gender}', f'a beautiful {gender}', f'an attractive {gender}']
    negative_terms = [f'a gross {gender}', f'an ugly {gender}', f'a hideous {gender}']

    # Interleave as [pos0, neg0, pos1, neg1, ...] so that consecutive logits
    # belong to the same positive/negative pair.
    prompts = [
        term
        for pair in zip(positive_terms, negative_terms)
        for term in pair
    ]
    text = clip.tokenize(prompts).to(device)

    with torch.no_grad():
        # Single forward pass for all prompts, so the image is processed once
        # instead of once per pair. Assumes each prompt's logit does not
        # depend on the other prompts in the batch (as with CLIP's
        # similarity logits), so the per-pair softmax below matches scoring
        # each pair separately.
        logits_per_image, _ = model(image_batch, text)
        probs = (
            logits_per_image.reshape(len(positive_terms), 2)
            .softmax(dim=-1)
            .cpu()
            .numpy()
        )

    positive_probs = [prob[0] for prob in probs]
    negative_probs = [prob[1] for prob in probs]

    # (p_pos - p_neg) lies in [-1, 1]; shift and scale it onto [0, 100].
    hotness_score, beauty_score, attractiveness_score = (
        round((pos - neg + 1) * 50, 2)
        for pos, neg in zip(positive_probs, negative_probs)
    )

    hot_score = sum(positive_probs) / len(positive_probs)
    ugly_score = sum(negative_probs) / len(negative_probs)
    composite = round(((hot_score - ugly_score) + 1) * 50, 2)
    return composite, hotness_score, beauty_score, attractiveness_score

# Build the web UI: an image plus a subject-noun dropdown go in, four score
# text boxes come out (in the order returned by `hotornot`).
# Components are taken from the top-level `gr` namespace, matching the
# `gr.Textbox` outputs; the older `gr.inputs.*` classes (and their `default=`
# keyword) are deprecated in favour of these with `value=`.
# NOTE(review): the description calls CLIP an "image captioning" model; CLIP
# scores image/text similarity rather than generating captions -- consider
# rewording the user-facing text.
iface = gr.Interface(
    fn=hotornot,
    inputs=[
        gr.Image(label="Image"),
        gr.Dropdown(
            [
                'person', 'man', 'woman'
            ],
            value='person',
        )
    ],
    outputs=[
        gr.Textbox(label="Total Hot or Not™ Score"),
        gr.Textbox(label="Hotness Score"),
        gr.Textbox(label="Beauty Score"),
        gr.Textbox(label="Attractiveness Score"),
    ],
    title="Hot or Not",
    description="A simple hot or not app using OpenAI's CLIP model. How it works: the input image is passed to OpenAI's CLIP image captioning model and evaluated for how much it conforms to the model's idea of hotness, beauty, and attractiveness. These values are then combined to produce a composite score on a scale of 0 to 100.",
)
# Start the Gradio server for this interface.
iface.launch()