deepspeed / scripts /playground /compare_openai_huggingface_clip.py
xingzhikb's picture
init
002bd9b
from transformers import CLIPProcessor, CLIPModel
import clip
from PIL import Image
import requests
import torch
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reference implementation: the OpenAI `clip` package.
openai_model, openai_preprocess = clip.load("ViT-B/32", device=device)

# Use half precision only on CUDA. Half-precision kernels are not reliably
# available on CPU, and the two models should run at the same precision for
# the comparison below to be meaningful.
# NOTE(review): this assumes `clip.load` keeps float32 weights on CPU and
# float16 on CUDA -- confirm against the installed `clip` version.
dtype = torch.float16 if device == "cuda" else torch.float32

# Hugging Face port of the same checkpoint, cached locally under `.cache`.
cache_dir = ".cache"
hf_model_name = "openai/clip-vit-base-patch32"
hf_model = CLIPModel.from_pretrained(
    hf_model_name, cache_dir=cache_dir, torch_dtype=dtype
).to(device)
hf_processor = CLIPProcessor.from_pretrained(hf_model_name, cache_dir=cache_dir)

# Sample image / caption pair fed to both models.
img_url = "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into a clear exception instead of an
# opaque image-decoding failure.
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw)
text = "a photo of a car parking on the side of the road in front of a building"
def hf_clip_inference(raw_image, text):
    """Run the Hugging Face CLIP model on one image / text pair.

    Args:
        raw_image: Image passed to ``hf_processor`` as ``images``.
        text: Caption passed to ``hf_processor`` as ``text``.

    Returns:
        A tuple ``(logits_per_image, pixel_values, input_ids)``: the model's
        image-to-text logits plus the preprocessed inputs that were fed to the
        model, so the caller can inspect or compare them.
    """
    inputs = hf_processor(text=text, images=raw_image, return_tensors="pt")

    # Iterate over a snapshot of the items because values are reassigned
    # inside the loop.
    for key, value in list(inputs.items()):
        if not isinstance(value, torch.Tensor):
            continue
        # Only float32 tensors are cast to the model dtype; every other tensor
        # keeps its own dtype and is just moved to the device.
        target_dtype = dtype if value.dtype == torch.float32 else value.dtype
        inputs[key] = value.to(device, target_dtype)

    with torch.inference_mode():
        outputs = hf_model(**inputs)

    return outputs.logits_per_image, inputs["pixel_values"], inputs["input_ids"]
def openai_clip_inference(raw_image, text):
    """Run the OpenAI reference CLIP model on one image / text pair.

    Args:
        raw_image: Image passed to ``openai_preprocess``.
        text: Caption passed to ``clip.tokenize``.

    Returns:
        A tuple ``(logits_per_image, image, tokens)``: the model's
        image-to-text logits plus the preprocessed image batch and the
        tokenized text that were fed to the model.
    """
    # unsqueeze(0) adds a leading dimension of size 1 before the image is
    # moved to the device and passed to the model.
    image = openai_preprocess(raw_image).unsqueeze(0).to(device)
    # Use a separate name so the `text` argument is not rebound to a tensor.
    tokens = clip.tokenize(text).to(device)

    with torch.inference_mode():
        logits_per_image, _ = openai_model(image, tokens)

    return logits_per_image, image, tokens
# Run both implementations on the same image / caption pair.
hf_logits, hf_image, hf_text = hf_clip_inference(raw_image, text)
openai_logits, openai_image, openai_text = openai_clip_inference(raw_image, text)

# Compare in float32: torch.allclose needs both operands to share a dtype, and
# the two models are not guaranteed to return logits at the same precision.
hf_logits_fp32 = hf_logits.float()
openai_logits_fp32 = openai_logits.float()
max_abs_diff = (hf_logits_fp32 - openai_logits_fp32).abs().max().item()

# torch.allclose defaults (rtol=1e-5, atol=1e-8) are tighter than float16
# resolution, so explicit, looser tolerances are used here.
# NOTE(review): 1e-2 is a judgment call for half-precision inference -- tune
# it if a stricter or looser match is wanted.
if not torch.allclose(hf_logits_fp32, openai_logits_fp32, rtol=1e-2, atol=1e-2):
    # Raised explicitly (not via `assert`) so the check also runs under
    # `python -O`, and so the failure reports how far apart the logits are.
    raise AssertionError(
        f"HF and OpenAI CLIP logits differ: max abs diff = {max_abs_diff:.6f}"
    )

# Interactive inspection point for the playground: the logits and preprocessed
# inputs of both models are in scope here. Set PYTHONBREAKPOINT=0 to skip it.
breakpoint()