bbexx committed
Commit e7cb1dc
1 Parent(s): e3885ae

UPDATE README.md

Files changed (1)
  1. README.md +11 -5
README.md CHANGED
@@ -11,21 +11,27 @@ Load from HuggingFace:
 import torch
 from PIL import Image
 from transformers import AutoModel, CLIPImageProcessor
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 model = AutoModel.from_pretrained(
     'jienengchen/ViTamin-XL-384px',
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True).cuda().eval()
+    trust_remote_code=True).to(device).eval()
 
 image = Image.open('./image.png').convert('RGB')
-
 image_processor = CLIPImageProcessor.from_pretrained('jienengchen/ViTamin-XL-384px')
 
 pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
 
-outputs = model(pixel_values)
+tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K')
+text = tokenizer(["a photo of vitamin", "a dog", "a cat"]).to(device)
+
+with torch.no_grad(), torch.cuda.amp.autocast():
+    image_features, text_features, logit_scale = model(pixel_values, text)
+    text_probs = (100.0 * image_features @ text_features.to(torch.float).T).softmax(dim=-1)
+
+print("Label probs:", text_probs)
+
 ```
 
 ## Main Results with CLIP Pre-training on DataComp-1B
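
Even after this commit, the README snippet will not run verbatim: it calls `open_clip.get_tokenizer` without ever importing `open_clip`, and it still moves `pixel_values` with a hardcoded `.cuda()` although the commit introduces a CPU fallback via `device`. Below is a minimal runnable consolidation of the committed example, a sketch rather than the model card's official code: it assumes the `open_clip_torch` package is installed, keeps the `model(pixel_values, text)` forward signature and return values shown in the README, and swaps `torch.cuda.amp.autocast()` for the device-agnostic `torch.autocast` so the CPU path also works.

```python
import torch
import open_clip  # missing from the committed snippet; provided by open_clip_torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModel.from_pretrained(
    'jienengchen/ViTamin-XL-384px',
    trust_remote_code=True).to(device).eval()

image = Image.open('./image.png').convert('RGB')
image_processor = CLIPImageProcessor.from_pretrained('jienengchen/ViTamin-XL-384px')

pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).to(device)  # .to(device), not hardcoded .cuda()

tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K')
text = tokenizer(["a photo of vitamin", "a dog", "a cat"]).to(device)

# torch.autocast (instead of torch.cuda.amp.autocast) also covers the CPU path.
with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
    # Per the committed README, the remote-code model returns
    # (image_features, text_features, logit_scale).
    image_features, text_features, logit_scale = model(pixel_values, text)
    # Scaled image-text similarities softmaxed into per-label probabilities.
    text_probs = (100.0 * image_features @ text_features.to(torch.float).T).softmax(dim=-1)

print("Label probs:", text_probs)
```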