from transformers import CLIPProcessor, CLIPModel
import clip
from PIL import Image
import requests
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reference OpenAI CLIP model and its matching image preprocessing transform.
openai_model, openai_preprocess = clip.load("ViT-B/32", device=device)

# Half precision is only used on CUDA. On CPU many fp16 kernels are missing or
# very slow, so fall back to fp32 there.
# NOTE(review): the OpenAI reference `clip.load` is understood to keep fp16
# weights on CUDA and convert to fp32 on CPU; choosing the dtype per device
# keeps both models in the same precision - confirm against the installed
# `clip` version.
dtype = torch.float16 if device == "cuda" else torch.float32
cache_dir = ".cache"
hf_model_name = "openai/clip-vit-base-patch32"

# Hugging Face port of the same checkpoint, plus its combined
# tokenizer / image processor.
hf_model = CLIPModel.from_pretrained(
    hf_model_name, cache_dir=cache_dir, torch_dtype=dtype
).to(device)
hf_model.eval()  # make inference mode explicit (no dropout / training behavior)
hf_processor = CLIPProcessor.from_pretrained(hf_model_name, cache_dir=cache_dir)


# Sample image/caption pair fed to both CLIP implementations.
img_url = "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg"

# Download with a timeout so a stalled connection cannot hang the script, and
# fail loudly on HTTP errors instead of handing an error page to PIL.
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # The raw urllib3 stream is not content-decoded (gzip/deflate) by default.
    response.raw.decode_content = True
    # `Image.open` is lazy; `convert` forces the full decode while the
    # connection is still open and returns an image independent of the stream.
    raw_image = Image.open(response.raw).convert("RGB")

text = "a photo of a car parking on the side of the road in front of a building"


def hf_clip_inference(raw_image, text):
    """Score ``text`` against ``raw_image`` with the Hugging Face CLIP model.

    Args:
        raw_image: Image passed to ``hf_processor`` as ``images``.
        text: Caption passed to ``hf_processor`` as ``text``.

    Returns:
        A tuple ``(logits_per_image, pixel_values, input_ids)``: the model's
        image-to-text logits, and the preprocessed image and token tensors
        that were fed to the model (already moved to ``device``).
    """
    encoded = hf_processor(text=text, images=raw_image, return_tensors="pt")

    # Move every tensor to the target device. Floating-point tensors are also
    # cast to the model's own dtype so they always match its weights; integer
    # tensors (token ids, attention mask) keep their dtype. A new dict is
    # built instead of mutating the batch while iterating over it.
    model_dtype = hf_model.dtype
    inputs = {}
    for key, value in encoded.items():
        if isinstance(value, torch.Tensor):
            if value.is_floating_point():
                value = value.to(device, model_dtype)
            else:
                value = value.to(device)
        inputs[key] = value

    with torch.inference_mode():
        outputs = hf_model(**inputs)
    return outputs.logits_per_image, inputs["pixel_values"], inputs["input_ids"]


def openai_clip_inference(raw_image, text):
    """Score ``text`` against ``raw_image`` with the OpenAI CLIP model.

    Args:
        raw_image: Image passed through ``openai_preprocess``.
        text: Caption passed to ``clip.tokenize``.

    Returns:
        A tuple ``(logits_per_image, image, tokens)``: the model's
        image-to-text logits, the preprocessed image tensor (with a leading
        batch dimension added), and the tokenized text, both on ``device``.
    """
    # `unsqueeze(0)` adds the batch dimension to the single preprocessed image.
    image = openai_preprocess(raw_image).unsqueeze(0).to(device)
    # Use a separate name so the `text` parameter is not shadowed by a tensor.
    tokens = clip.tokenize(text).to(device)

    with torch.inference_mode():
        logits_per_image, _ = openai_model(image, tokens)
    return logits_per_image, image, tokens


# Run the same image/caption pair through both implementations.
hf_logits, hf_image, hf_text = hf_clip_inference(raw_image, text)
openai_logits, openai_image, openai_text = openai_clip_inference(raw_image, text)

# Compare the logits with explicit tolerances: the default `torch.allclose`
# tolerances (rtol=1e-5, atol=1e-8) are tighter than fp16 precision, and
# `allclose` errors out when the two tensors have different dtypes.
# `assert_close` raises AssertionError with a detailed mismatch report and,
# unlike a bare `assert`, is not stripped under `python -O`.
# NOTE(review): rtol/atol of 1e-2 are a loose choice meant to absorb
# half-precision rounding - tighten if both models run in fp32.
torch.testing.assert_close(
    hf_logits, openai_logits, rtol=1e-2, atol=1e-2, check_dtype=False
)
max_abs_diff = (hf_logits.float() - openai_logits.float()).abs().max().item()
print(f"HF and OpenAI CLIP logits match (max abs diff: {max_abs_diff:.6f})")