deepspeed / scripts /playground /compare_openai_huggingface_clip.py

init

002bd9b 11 months ago

1.73 kB

	from transformers import CLIPProcessor, CLIPModel
	import clip
	from PIL import Image
	import requests
	import torch

	# Run both models on the GPU when one is available, otherwise on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	openai_model, openai_preprocess = clip.load("ViT-B/32", device=device)

	# Half precision only makes sense on CUDA: clip.load() converts the reference
	# model to float32 when it is placed on the CPU, so the Hugging Face model
	# must use float32 there as well for the two outputs to be comparable.
	dtype = torch.float16 if device == "cuda" else torch.float32
	cache_dir = ".cache"
	hf_model_name = "openai/clip-vit-base-patch32"
	hf_model = CLIPModel.from_pretrained(hf_model_name, cache_dir=cache_dir, torch_dtype=dtype).to(device)
	hf_processor = CLIPProcessor.from_pretrained(hf_model_name, cache_dir=cache_dir)


	# Sample image and caption fed to both CLIP implementations.
	img_url = "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg"
	# Bound the download time and fail on HTTP errors here, so an error page is
	# never handed to PIL (which would only report an opaque decoding failure).
	response = requests.get(img_url, stream=True, timeout=30)
	response.raise_for_status()
	raw_image = Image.open(response.raw)
	text = "a photo of a car parking on the side of the road in front of a building"


	def hf_clip_inference(raw_image, text):
	    """Score ``raw_image`` against ``text`` with the Hugging Face CLIP model.

	    Uses the module-level ``hf_processor``, ``hf_model``, ``device`` and
	    ``dtype``.

	    Args:
	        raw_image: Image passed to ``hf_processor`` as ``images``.
	        text: Caption passed to ``hf_processor`` as ``text``.

	    Returns:
	        A tuple ``(logits_per_image, pixel_values, input_ids)``: the model's
	        image-to-text logits followed by the preprocessed tensors that were
	        fed to the model, returned so the caller can inspect them.
	    """
	    encoded = hf_processor(text=text, images=raw_image, return_tensors="pt")

	    # Build a fresh dict instead of writing back into the mapping being
	    # iterated over.
	    inputs = {}
	    for key, value in encoded.items():
	        if isinstance(value, torch.Tensor):
	            # Only float32 tensors follow the model dtype; every other tensor
	            # (e.g. the integer token ids) keeps its own dtype.
	            target_dtype = dtype if value.dtype == torch.float32 else value.dtype
	            value = value.to(device, target_dtype)
	        inputs[key] = value

	    with torch.inference_mode():
	        outputs = hf_model(**inputs)
	    return outputs.logits_per_image, inputs["pixel_values"], inputs["input_ids"]


	def openai_clip_inference(raw_image, text):
	    """Score ``raw_image`` against ``text`` with the OpenAI reference CLIP model.

	    Uses the module-level ``openai_preprocess``, ``openai_model`` and
	    ``device``.

	    Args:
	        raw_image: Image passed to ``openai_preprocess``.
	        text: Caption passed to ``clip.tokenize``.

	    Returns:
	        A tuple ``(logits_per_image, image, tokens)``: the model's
	        image-to-text logits followed by the preprocessed image tensor and
	        the tokenized caption that were fed to the model.
	    """
	    # unsqueeze(0) adds the leading batch dimension for the single image.
	    image = openai_preprocess(raw_image).unsqueeze(0).to(device)
	    # Keep the caller's ``text`` intact; the token tensor gets its own name.
	    tokens = clip.tokenize(text).to(device)

	    with torch.inference_mode():
	        logits_per_image, _ = openai_model(image, tokens)
	    return logits_per_image, image, tokens


	# Run the same image/caption pair through both implementations.
	hf_logits, hf_image, hf_text = hf_clip_inference(raw_image, text)
	openai_logits, openai_image, openai_text = openai_clip_inference(raw_image, text)

	# torch.allclose() rejects mismatched dtypes and its default tolerances
	# (rtol=1e-5, atol=1e-8) are far too strict for half-precision logits, so
	# compare in float32 with tolerances that depend on the working precision.
	half_precision = torch.float16 in (hf_logits.dtype, openai_logits.dtype)
	rtol, atol = (1e-2, 1e-1) if half_precision else (1e-4, 1e-3)
	hf_logits_f32 = hf_logits.float()
	openai_logits_f32 = openai_logits.float()
	max_abs_diff = (hf_logits_f32 - openai_logits_f32).abs().max().item()
	assert torch.allclose(hf_logits_f32, openai_logits_f32, rtol=rtol, atol=atol), (
	    f"CLIP logits differ: max abs diff {max_abs_diff:.6f} (rtol={rtol}, atol={atol})"
	)
	# Deliberate inspection point: drop into the debugger with the logits and
	# preprocessed inputs of both pipelines in scope (PYTHONBREAKPOINT=0 skips it).
	breakpoint()