Add support for AutoModel
- README.md +28 -4
- config.json +7 -1
- marqo_fashionCLIP.py +70 -0
- model.safetensors +3 -0
- preprocessor_config.json +3 -2
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
README.md CHANGED

@@ -29,6 +29,32 @@ The model was fine-tuned from ViT-B-16 (laion2b_s34b_b88k).
 
 ## Usage
 
+### Hugging Face
+
+The model can be loaded with AutoModel by
+
+```python
+from transformers import AutoModel, AutoProcessor
+model = AutoModel.from_pretrained('Marqo/marqo-fashionCLIP', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('Marqo/marqo-fashionCLIP', trust_remote_code=True)
+
+import torch
+from PIL import Image
+
+image = [Image.open("docs/fashion-hippo.png")]
+text = ["a hat", "a t-shirt", "shoes"]
+processed = processor(text=text, images=image, padding='max_length', return_tensors="pt")
+
+with torch.no_grad():
+    image_features = model.get_image_features(processed['pixel_values'], normalize=True)
+    text_features = model.get_text_features(processed['input_ids'], normalize=True)
+
+text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+print("Label probs:", text_probs)
+# [0.99990773, 0.00006382, 0.00002847]
+```
+
 ### OpenCLIP
 
 The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by

@@ -45,10 +71,8 @@ image = preprocess_val(Image.open("docs/fashion-hippo.png")).unsqueeze(0)
 text = tokenizer(["a hat", "a t-shirt", "shoes"])
 
 with torch.no_grad(), torch.cuda.amp.autocast():
-    image_features = model.encode_image(image)
-    text_features = model.encode_text(text)
-    image_features /= image_features.norm(dim=-1, keepdim=True)
-    text_features /= text_features.norm(dim=-1, keepdim=True)
+    image_features = model.encode_image(image, normalize=True)
+    text_features = model.encode_text(text, normalize=True)
 
 text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
 
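The second hunk swaps the manual L2 normalization in the OpenCLIP snippet for open_clip's `normalize=True` flag. A quick sketch of why the two are equivalent (the tensor shape is an arbitrary assumption, not taken from the model):

```python
import torch
import torch.nn.functional as F

# open_clip's encode_image/encode_text with normalize=True L2-normalize the output
# along the feature dimension, which is what the removed README lines did by hand.
features = torch.randn(2, 512)                            # assumed (batch, dim) embeddings
manual = features / features.norm(dim=-1, keepdim=True)   # the removed lines
builtin = F.normalize(features, dim=-1)                    # what normalize=True applies
print(torch.allclose(manual, builtin))                     # True, up to floating-point tolerance
```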
config.json CHANGED

@@ -1,3 +1,9 @@
 {
+  "auto_map": {
+    "AutoConfig": "marqo_fashionCLIP.MarqoFashionCLIPConfig",
+    "AutoModel": "marqo_fashionCLIP.MarqoFashionCLIP",
+    "AutoProcessor": "marqo_fashionCLIP.CLIPProcessor"
+  },
+  "open_clip_model_name": "hf-hub:Marqo/marqo-fashionCLIP",
   "model_type": "clip"
-}
+}
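The `auto_map` block is what lets the stock Auto classes resolve to the custom code shipped in marqo_fashionCLIP.py once `trust_remote_code=True` is passed. A minimal sketch of that resolution, assuming the published repo id `Marqo/marqo-fashionCLIP`:

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code=True allows transformers to import marqo_fashionCLIP.py from the repo
# and instantiate the classes named in auto_map instead of the built-in CLIP classes.
config = AutoConfig.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)
print(type(config).__name__)        # expected: MarqoFashionCLIPConfig
print(config.open_clip_model_name)  # expected: hf-hub:Marqo/marqo-fashionCLIP

model = AutoModel.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)
print(type(model).__name__)         # expected: MarqoFashionCLIP
```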
marqo_fashionCLIP.py ADDED

@@ -0,0 +1,70 @@
+import torch
+from open_clip import create_model
+from transformers import PretrainedConfig, PreTrainedModel, CLIPProcessor
+from transformers.models.clip.modeling_clip import CLIPOutput
+from typing import Optional, Tuple, Union
+
+class MarqoFashionCLIPConfig(PretrainedConfig):
+    def __init__(
+        self,
+        open_clip_model_name: str = "",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.open_clip_model_name = open_clip_model_name
+
+
+class MarqoFashionCLIP(PreTrainedModel):
+    config_class = MarqoFashionCLIPConfig
+
+    def __init__(self, config: MarqoFashionCLIPConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = create_model(config.open_clip_model_name, output_dict=True)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        normalize: bool = False,
+        **kwargs
+    ) -> torch.FloatTensor:
+
+        with torch.inference_mode():
+            image_features = self.model.encode_image(pixel_values, normalize=normalize)
+        return image_features
+
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor,
+        normalize: bool = False,
+        **kwargs
+    ) -> torch.FloatTensor:
+
+        with torch.inference_mode():
+            text_features = self.model.encode_text(input_ids, normalize=normalize)
+        return text_features
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CLIPOutput]:
+
+        vision_outputs = self.get_image_features(pixel_values=pixel_values, normalize=True)
+        text_outputs = self.get_text_features(input_ids=input_ids, normalize=True)
+
+        logits_per_text = text_outputs @ vision_outputs.T
+        logits_per_image = logits_per_text.T
+
+        if not return_dict:
+            return logits_per_image, logits_per_text, text_outputs, vision_outputs
+
+        return CLIPOutput(
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_outputs,
+            image_embeds=vision_outputs
+        )
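Besides get_image_features/get_text_features used in the README, the wrapper's forward() normalizes both embeddings and returns a CLIPOutput; note that, unlike the README snippet, it applies no logit scale before building the similarity matrix. A hedged usage sketch mirroring the README's inputs (the image path is the README's and is assumed to exist locally):

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)

batch = processor(
    text=["a hat", "a t-shirt", "shoes"],
    images=[Image.open("docs/fashion-hippo.png")],
    padding="max_length",
    return_tensors="pt",
)

with torch.no_grad():
    out = model(
        input_ids=batch["input_ids"],
        pixel_values=batch["pixel_values"],
        return_dict=True,
    )

# logits_per_image is image_embeds @ text_embeds.T on unit-norm embeddings, i.e. raw
# cosine similarities; scale them (e.g. by 100, as in the README) before the softmax
# if you want similarly peaked probabilities.
probs = (100.0 * out.logits_per_image).softmax(dim=-1)
print(probs)
```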
model.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9491b78dc04d0d18828075a2654c74f3c9134151f6d85ee8e8a0b022f24cc598
+size 598518820
preprocessor_config.json CHANGED

@@ -1,4 +1,7 @@
 {
+  "auto_map": {
+    "AutoProcessor": "marqo_fashionCLIP.MarqoFashionImageProcessor"
+  },
   "crop_size": {
     "height": 224,
     "width": 224
@@ -6,7 +9,6 @@
   "do_center_crop": true,
   "do_convert_rgb": true,
   "do_normalize": true,
-  "do_rescale": true,
   "do_resize": true,
   "feature_extractor_type": "CLIPFeatureExtractor",
   "image_mean": [
@@ -21,7 +23,6 @@
     0.27577711
   ],
   "resample": 3,
-  "rescale_factor": 0.00392156862745098,
   "size": {
     "shortest_edge": 224
   }
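These values describe the usual CLIP-style pipeline: resize the shortest edge to 224 with bicubic resampling (`"resample": 3`), center-crop to 224x224, convert to RGB, and normalize with the listed mean/std. With the stock CLIPImageProcessor, removing `do_rescale`/`rescale_factor` just falls back to its defaults (rescale by 1/255); the behavior of the custom MarqoFashionImageProcessor named in `auto_map` is not shown in this commit. As a rough illustration, using the stock class as a stand-in:

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Stand-in illustration only: auto_map points AutoProcessor at a custom
# MarqoFashionImageProcessor whose implementation is not part of this diff.
image_processor = CLIPImageProcessor.from_pretrained("Marqo/marqo-fashionCLIP")
print(image_processor.size, image_processor.crop_size)
# expected: {'shortest_edge': 224} {'height': 224, 'width': 224}

pixel_values = image_processor(
    images=Image.open("docs/fashion-hippo.png"), return_tensors="pt"
)["pixel_values"]
print(pixel_values.shape)  # expected: torch.Size([1, 3, 224, 224])
```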
special_tokens_map.json CHANGED

@@ -14,7 +14,7 @@
     "single_word": false
   },
   "pad_token": {
-    "content": "
+    "content": "!",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer_config.json CHANGED

@@ -24,7 +24,7 @@
   "eos_token": "<|endoftext|>",
   "errors": "replace",
   "model_max_length": 77,
-  "pad_token": "
+  "pad_token": "!",
   "tokenizer_class": "CLIPTokenizer",
   "unk_token": "<|endoftext|>"
 }
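Together with special_tokens_map.json above, this switches the pad token to "!", presumably so that Hugging Face padding matches the zero-padded sequences open_clip's encode_text expects ("!" maps to id 0 in the CLIP BPE vocabulary); `model_max_length: 77` is also why the README example passes `padding='max_length'`. A small sketch, assuming the published repo id:

```python
from transformers import AutoTokenizer

# The tokenizer half of the processor: pads every caption to CLIP's 77-token context.
tokenizer = AutoTokenizer.from_pretrained("Marqo/marqo-fashionCLIP")
print(tokenizer.pad_token)  # expected: "!"

enc = tokenizer(["a hat", "a t-shirt", "shoes"], padding="max_length", return_tensors="pt")
print(enc["input_ids"].shape)  # expected: torch.Size([3, 77])
```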