|
from transformers import CLIPProcessor, CLIPModel |
|
import clip |
|
from PIL import Image |
|
import requests |
|
import torch |
|
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
openai_model, openai_preprocess = clip.load("ViT-B/32", device=device) |
|
|
|
dtype = torch.float16 |
|
cache_dir = ".cache" |
|
hf_model_name = "openai/clip-vit-base-patch32" |
|
hf_model = CLIPModel.from_pretrained(hf_model_name, cache_dir=cache_dir, torch_dtype=dtype).to(device) |
|
hf_processor = CLIPProcessor.from_pretrained(hf_model_name, cache_dir=cache_dir) |
|
|
|
|
|
img_url = "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg" |
|
raw_image = Image.open(requests.get(img_url, stream=True).raw) |
|
text = "a photo of a car parking on the side of the road in front of a building" |
|
|
|
|
|
def hf_clip_inference(raw_image, text): |
|
inputs = hf_processor(text=text, images=raw_image, return_tensors="pt") |
|
for k, v in inputs.items(): |
|
if isinstance(v, torch.Tensor): |
|
inputs[k] = v.to(device, dtype if v.dtype == torch.float32 else v.dtype) |
|
breakpoint() |
|
with torch.inference_mode(): |
|
outputs = hf_model(**inputs) |
|
return outputs.logits_per_image, inputs["pixel_values"], inputs["input_ids"] |
|
|
|
|
|
def openai_clip_inference(raw_image, text): |
|
image = openai_preprocess(raw_image).unsqueeze(0).to(device) |
|
text = clip.tokenize(text).to(device) |
|
breakpoint() |
|
|
|
with torch.inference_mode(): |
|
logits_per_image, _ = openai_model(image, text) |
|
return logits_per_image, image, text |
|
|
|
|
|
hf_logits, hf_image, hf_text = hf_clip_inference(raw_image, text) |
|
openai_logits, openai_image, openai_text = openai_clip_inference(raw_image, text) |
|
assert torch.allclose(hf_logits, openai_logits) |
|
breakpoint() |
|
|