patrickramos committed
Commit b991b4f • 1 Parent(s): b66e6ca

Update app.py

Files changed (1)
app.py +53 -179
app.py CHANGED
@@ -1,20 +1,14 @@
-from transformers import CLIPModel, CLIPProcessor
-
-MODEL_ID = 'openai/clip-vit-base-patch32' #@param {'type': 'string'}
-LOAD_IN_8BIT = False #@param {'type': 'boolean'}
-BATCH_SIZE = 1024 #@param {'type': 'integer'}
-REVISION = '' #@param {'type': 'string'}
-REVISION = None if not REVISION else REVISION
-
-from transformers import CLIPConfig
-from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file
-
 import os
 from huggingface_hub import login
 
 login(os.environ['hf_token'])
 
+
+from transformers import CLIPConfig, CLIPModel
+from torch import nn
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+
 def load_distillclip(model_id, revision=None):
     ckpt_path = hf_hub_download(repo_id=model_id, filename="model.safetensors", revision=revision)
     config = CLIPConfig.from_pretrained(model_id)
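The import block added above pairs `hf_hub_download` with safetensors' `load_file`, which is how `load_distillclip` pulls the distilled checkpoint off the Hub. A minimal sketch of that pattern (assuming the `Ramos-Ramos/distillclip` repo referenced later in this diff is publicly readable):

```python
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# Download one file from a Hub repo (cached locally on repeat calls), then
# read its tensors into a plain dict of parameter name -> torch.Tensor.
ckpt_path = hf_hub_download(repo_id='Ramos-Ramos/distillclip', filename='model.safetensors')
state_dict = load_file(ckpt_path)
print(list(state_dict)[:3])  # peek at a few parameter names
```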
@@ -27,34 +21,21 @@ def load_distillclip(model_id, revision=None):
         bias=True,
     )
     model.vision_model.pre_layrnorm = nn.Identity()
-    # model.vision_model.post_layernorm = nn.Identity()
     print(model.load_state_dict({k.removeprefix('student.'): v for k, v in load_file(ckpt_path).items()}))
-    # model.load_state_dict(load_file(ckpt_path))
     return model
+
 
+import torch
 from torch import nn
-from accelerate import init_empty_weights, infer_auto_device_map
-from transformers import CLIPModel, CLIPProcessor
 from einops import reduce
+from tqdm.auto import tqdm
 
 class ZeroShotCLIP(nn.Module):
-    def __init__(self, model_id=None, model=None, processor=None,classes=[], templates=[], load_in_8bit=False):
+    def __init__(self, model=None, processor=None, classes=[], templates=[], load_in_8bit=False):
         super().__init__()
 
-        self.load_in_8bit = load_in_8bit
-        if model is not None and processor is not None:
-            self.model = model.eval()
-            self.processor = processor
-        else:
-            if load_in_8bit:
-                with init_empty_weights():
-                    dummy = CLIPModel.from_pretrained(model_id)
-                device_map = infer_auto_device_map(dummy)
-                del dummy
-                self.model = CLIPModel.from_pretrained(model_id, load_in_8bit=True, device_map=device_map)
-            else:
-                self.model = CLIPModel.from_pretrained(model_id).eval()
-            self.processor = CLIPProcessor.from_pretrained(model_id)
+        self.model = model.eval()
+        self.processor = processor
         self.classes = classes
         self.templates = templates
         self._init_weights()
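The dict comprehension fed to `load_state_dict` above does the real work of the loader: the distillation run saved the model under a `student.` prefix, so every key is renamed before loading, and the return value of `load_state_dict` (its lists of missing and unexpected keys) is printed as a sanity check. A self-contained illustration of the renaming, with hypothetical key names:

```python
# Hypothetical keys, as a distillation training wrapper might save them:
ckpt = {
    'student.logit_scale': 0,
    'student.vision_model.embeddings.class_embedding': 1,
}
# str.removeprefix (Python 3.9+) strips the prefix only where it is present.
renamed = {k.removeprefix('student.'): v for k, v in ckpt.items()}
print(renamed)  # {'logit_scale': 0, 'vision_model.embeddings.class_embedding': 1}
```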
@@ -63,8 +44,6 @@ class ZeroShotCLIP(nn.Module):
     def _init_weights(self):
         self.model.eval()
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        if not self.load_in_8bit:
-            self.model = self.model.to(device)
         weights = []
         for classname in tqdm(self.classes):
             prompts = [template.format(classname) for template in self.templates]
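The lines the diff elides here (old 71-75 / new 50-54, unchanged) build `embeddings` for each class; the first context line of the next hunk shows them being appended. `_init_weights` follows the standard CLIP zero-shot recipe: embed every filled-in template, normalize, average per class, and renormalize. Roughly, under the assumption that the elided code matches that recipe (the app's actual version averages with `einops.reduce`):

```python
import torch

@torch.no_grad()
def zero_shot_weights(model, processor, classes, templates, device='cpu'):
    # Build one unit-norm text embedding per class by averaging prompt variants.
    weights = []
    for classname in classes:
        prompts = [template.format(classname) for template in templates]
        inputs = processor(text=prompts, padding=True, return_tensors='pt').to(device)
        emb = model.get_text_features(**inputs)
        emb = emb / emb.norm(dim=-1, keepdim=True)  # normalize each prompt embedding
        emb = emb.mean(dim=0)                       # average over templates
        emb = emb / emb.norm()                      # renormalize the class prototype
        weights.append(emb)
    return torch.stack(weights)                     # shape: (num_classes, embed_dim)
```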
@@ -76,159 +55,54 @@ class ZeroShotCLIP(nn.Module):
             weights.append(embeddings)
         weights = torch.stack(weights)
         self.register_buffer('weights', weights)
-        if not self.load_in_8bit:
-            self.model = self.model.cpu()
 
     @torch.no_grad()
     def forward(self, pixel_values):
         x = self.model.get_image_features(pixel_values=pixel_values)
         x /= x.norm(dim=-1, keepdim=True)
-        return x.mm(self.weights.t())
+        return x.mm(self.weights.t()) * 100.00000762939453
 
     def preprocess_and_forward(self, x):
-        x = self.processor(images=x)
-        return self(x)
-
-    def to(self, *args, **kwargs):
-        if not self.load_in_8bit:
-            return super().to(*args, **kwargs)
-        else:
-            self.weights = self.weights.to(*args, **kwargs)
-            return self
-
-model = load_distillclip('Ramos-Ramos/distillclip-different-moon-37')
-processor = CLIPProcessor.from_pretrained(MODEL_ID)
-
-pipe = pipeline("zero-shot-image-classification", model=model, feature_extractor=processor.image_processor, tokenizer=processor.tokenizer)
-
-cifar_templates = [
-    'a photo of a {}.',
-    'a blurry photo of a {}.',
-    'a black and white photo of a {}.',
-    'a low contrast photo of a {}.',
-    'a high contrast photo of a {}.',
-    'a bad photo of a {}.',
-    'a good photo of a {}.',
-    'a photo of a small {}.',
-    'a photo of a big {}.',
-    'a photo of the {}.',
-    'a blurry photo of the {}.',
-    'a black and white photo of the {}.',
-    'a low contrast photo of the {}.',
-    'a high contrast photo of the {}.',
-    'a bad photo of the {}.',
-    'a good photo of the {}.',
-    'a photo of the small {}.',
-    'a photo of the big {}.',
-]
-
-imagenet_templates = [
-    'a bad photo of a {}.',
-    'a photo of many {}.',
-    'a sculpture of a {}.',
-    'a photo of the hard to see {}.',
-    'a low resolution photo of the {}.',
-    'a rendering of a {}.',
-    'graffiti of a {}.',
-    'a bad photo of the {}.',
-    'a cropped photo of the {}.',
-    'a tattoo of a {}.',
-    'the embroidered {}.',
-    'a photo of a hard to see {}.',
-    'a bright photo of a {}.',
-    'a photo of a clean {}.',
-    'a photo of a dirty {}.',
-    'a dark photo of the {}.',
-    'a drawing of a {}.',
-    'a photo of my {}.',
-    'the plastic {}.',
-    'a photo of the cool {}.',
-    'a close-up photo of a {}.',
-    'a black and white photo of the {}.',
-    'a painting of the {}.',
-    'a painting of a {}.',
-    'a pixelated photo of the {}.',
-    'a sculpture of the {}.',
-    'a bright photo of the {}.',
-    'a cropped photo of a {}.',
-    'a plastic {}.',
-    'a photo of the dirty {}.',
-    'a jpeg corrupted photo of a {}.',
-    'a blurry photo of the {}.',
-    'a photo of the {}.',
-    'a good photo of the {}.',
-    'a rendering of the {}.',
-    'a {} in a video game.',
-    'a photo of one {}.',
-    'a doodle of a {}.',
-    'a close-up photo of the {}.',
-    'a photo of a {}.',
-    'the origami {}.',
-    'the {} in a video game.',
-    'a sketch of a {}.',
-    'a doodle of the {}.',
-    'a origami {}.',
-    'a low resolution photo of a {}.',
-    'the toy {}.',
-    'a rendition of the {}.',
-    'a photo of the clean {}.',
-    'a photo of a large {}.',
-    'a rendition of a {}.',
-    'a photo of a nice {}.',
-    'a photo of a weird {}.',
-    'a blurry photo of a {}.',
-    'a cartoon {}.',
-    'art of a {}.',
-    'a sketch of the {}.',
-    'a embroidered {}.',
-    'a pixelated photo of a {}.',
-    'itap of the {}.',
-    'a jpeg corrupted photo of the {}.',
-    'a good photo of a {}.',
-    'a plushie {}.',
-    'a photo of the nice {}.',
-    'a photo of the small {}.',
-    'a photo of the weird {}.',
-    'the cartoon {}.',
-    'art of the {}.',
-    'a drawing of the {}.',
-    'a photo of the large {}.',
-    'a black and white photo of a {}.',
-    'the plushie {}.',
-    'a dark photo of a {}.',
-    'itap of a {}.',
-    'graffiti of the {}.',
-    'a toy {}.',
-    'itap of my {}.',
-    'a photo of a cool {}.',
-    'a photo of a small {}.',
-    'a tattoo of the {}.',
-]
-
-dashcam_templates = [
-    'a dashcam recording of {}.',
-    'a picture of {}.',
-    'a recording of {}.'
-]
-
-stl10_templates = [
-    'a photo of a {}.',
-    'a photo of the {}.',
-]
-
-oxfordpets_templates = [
-    'a photo of a {}, a type of pet.',
-]
-
-def predict(image, texts):
-    texts = texts.split(', ')
-    out = pipe(image, candidate_labels=texts)
-    return {d['label']: d['score'] for d in out}
+        x = self.processor(images=x, return_tensors='pt')
+        return self(x['pixel_values'])
+
+
+from transformers import CLIPProcessor
+
+model = load_distillclip('Ramos-Ramos/distillclip')
+processor = CLIPProcessor.from_pretrained('Ramos-Ramos/distillclip')
+
+
+def infer(image, classes, templates):
+    classes = [label.strip() for label in classes.split(',')]
+    print(classes)
+    templates = [template.strip() for template in templates.split(';')]
+    print(templates)
+    clip = ZeroShotCLIP(model=model, processor=processor, classes=classes, templates=templates)
+    preds = clip.preprocess_and_forward(image).softmax(dim=1).flatten()
+    return {label: score.item() for label, score in zip(classes, preds)}
+
+
+import gradio as gr
+
+title = 'DistillCLIP'
+description = 'Zero-shot image classification demo with DistillCLIP'
+article = '''DistillCLIP is a distilled version of [CLIP-ViT/B-32](https://huggingface.co/openai/clip-vit-base-patch32).
+
+Please refer to the [DistillCLIP model card](https://huggingface.co/Ramos-Ramos/distillclip) for more details on DistillCLIP.
+
+Note: As multiplying logits by a temperature prior to the softmax can better distinguish final scores, we multiply DistillCLIP's text-image similarity scores by the teacher CLIP's temperature.'''
 
 demo = gr.Interface(
-    fn=predict,
-    inputs=[gr.Image(type='pil'), gr.Textbox(label='comma separated labels'), gr.Dropwdown(['CIFAR', 'ImageNet','STL-10', 'Oxford Pets', 'Dashcam'], label='text templates')],
-    outputs='label',
+    fn=infer,
+    inputs=[
+        gr.Image(label='Image', type='pil'),
+        gr.Textbox(label='Classes', placeholder='cat, truck', info='Classes for classification. Separate classes with commas.'),
+        gr.Textbox(label='Prompt/s', placeholder='a photo of a {}.; a blurry photo of a {}.', info='Prompt templates. Use "{}" as placeholder for class. Separate prompts with semi-colons.')
+    ],
+    outputs=gr.Label(label='Class scores'),
+    title=title,
+    description=description,
+    article=article
 )
-
-demo.launch(debug=True, share=True)
+demo.launch()
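About the constant introduced in `forward`: 100.00000762939453 is the teacher's temperature mentioned in the article text. CLIP stores its temperature as a log-space `logit_scale` parameter whose exponential is approximately 100 for the teacher, so the commit presumably bakes in the number read off the teacher checkpoint along these lines (a sketch, not taken from this repo; assumes the openai/clip-vit-base-patch32 weights):

```python
from transformers import CLIPModel

teacher = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
# logit_scale is stored in log space; exp() recovers the multiplier applied
# to image-text cosine similarities before the softmax.
print(teacher.logit_scale.exp().item())  # ~100.00000762939453
```

Scaling cosine similarities (all in [-1, 1]) by roughly 100 before the softmax is what turns near-uniform scores into a peaked label distribution, which is the behavior the article's note describes.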