DavidJung committed on
Commit
8478037
·
1 Parent(s): 6513dfd

Add support for AutoModel

Browse files
README.md CHANGED
@@ -29,6 +29,32 @@ The model was fine-tuned from ViT-B-16 (laion2b_s34b_b88k).
29
 
30
  ## Usage
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  ### OpenCLIP
33
 
34
  The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by
@@ -45,10 +71,8 @@ image = preprocess_val(Image.open("docs/fashion-hippo.png")).unsqueeze(0)
45
  text = tokenizer(["a hat", "a t-shirt", "shoes"])
46
 
47
  with torch.no_grad(), torch.cuda.amp.autocast():
48
- image_features = model.encode_image(image)
49
- text_features = model.encode_text(text)
50
- image_features /= image_features.norm(dim=-1, keepdim=True)
51
- text_features /= text_features.norm(dim=-1, keepdim=True)
52
 
53
  text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
54
 
 
29
 
30
  ## Usage
31
 
32
+ ### Hugging Face
33
+
34
+ The model can be loaded with AutoModel by
35
+
36
+ ```python
37
+ from transformers import AutoModel, AutoProcessor
38
+ model = AutoModel.from_pretrained('Marqo/marqo-fashionCLIP', trust_remote_code=True)
39
+ processor = AutoProcessor.from_pretrained('Marqo/marqo-fashionCLIP', trust_remote_code=True)
40
+
41
+ import torch
42
+ from PIL import Image
43
+
44
+ image = [Image.open("docs/fashion-hippo.png")]
45
+ text = ["a hat", "a t-shirt", "shoes"]
46
+ processed = processor(text=text, images=image, padding='max_length', return_tensors="pt")
47
+
48
+ with torch.no_grad():
49
+ image_features = model.get_image_features(processed['pixel_values'], normalize=True)
50
+ text_features = model.get_text_features(processed['input_ids'], normalize=True)
51
+
52
+ text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
53
+
54
+ print("Label probs:", text_probs)
55
+ # [0.99990773, 0.00006382, 0.00002847]
56
+ ```
57
+
58
  ### OpenCLIP
59
 
60
  The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by
 
71
  text = tokenizer(["a hat", "a t-shirt", "shoes"])
72
 
73
  with torch.no_grad(), torch.cuda.amp.autocast():
74
+ image_features = model.encode_image(image, normalize=True)
75
+ text_features = model.encode_text(text, normalize=True)
 
 
76
 
77
  text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
78
 
config.json CHANGED
@@ -1,3 +1,9 @@
1
  {
 
 
 
 
 
 
2
  "model_type": "clip"
3
- }
 
1
  {
2
+ "auto_map": {
3
+ "AutoConfig": "marqo_fashionCLIP.MarqoFashionCLIPConfig",
4
+ "AutoModel": "marqo_fashionCLIP.MarqoFashionCLIP",
5
+ "AutoProcessor": "marqo_fashionCLIP.CLIPProcessor"
6
+ },
7
+ "open_clip_model_name": "hf-hub:Marqo/marqo-fashionCLIP",
8
  "model_type": "clip"
9
+ }
marqo_fashionCLIP.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from open_clip import create_model
3
+ from transformers import PretrainedConfig, PreTrainedModel, CLIPProcessor
4
+ from transformers.models.clip.modeling_clip import CLIPOutput
5
+ from typing import Optional, Tuple, Union
6
+
7
class MarqoFashionCLIPConfig(PretrainedConfig):
    """Configuration for ``MarqoFashionCLIP``.

    Besides whatever ``PretrainedConfig`` accepts through ``**kwargs``, it
    stores a single extra field, ``open_clip_model_name``, which the model
    class passes to ``open_clip.create_model``.
    """

    def __init__(self, open_clip_model_name: str = "", **kwargs):
        # Let the base class consume the generic configuration options first.
        super().__init__(**kwargs)
        # Identifier handed to OpenCLIP when the model is built.
        self.open_clip_model_name = open_clip_model_name
15
+
16
+
17
class MarqoFashionCLIP(PreTrainedModel):
    """``PreTrainedModel`` wrapper that delegates to an OpenCLIP model.

    The wrapped model is created with ``open_clip.create_model`` from
    ``config.open_clip_model_name`` and switched to eval mode. Feature
    extraction runs under ``torch.inference_mode()``, so the returned
    tensors cannot be used for back-propagation.
    """

    config_class = MarqoFashionCLIPConfig

    def __init__(self, config: MarqoFashionCLIPConfig):
        super().__init__(config)
        self.config = config
        # Build the underlying OpenCLIP model named in the configuration.
        self.model = create_model(config.open_clip_model_name, output_dict=True)
        self.model.to(self.device)
        self.model.eval()

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        normalize: bool = False,
        **kwargs
    ) -> torch.FloatTensor:
        """Encode images with the wrapped OpenCLIP model.

        Args:
            pixel_values: Pre-processed image tensor, forwarded unchanged to
                ``encode_image``.
            normalize: Forwarded to ``encode_image`` as ``normalize``.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            The image features produced by ``encode_image``.
        """
        with torch.inference_mode():
            image_features = self.model.encode_image(pixel_values, normalize=normalize)
        return image_features

    def get_text_features(
        self,
        input_ids: torch.Tensor,
        normalize: bool = False,
        **kwargs
    ) -> torch.FloatTensor:
        """Encode tokenized text with the wrapped OpenCLIP model.

        Args:
            input_ids: Token id tensor, forwarded unchanged to
                ``encode_text``.
            normalize: Forwarded to ``encode_text`` as ``normalize``.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            The text features produced by ``encode_text``.
        """
        with torch.inference_mode():
            text_features = self.model.encode_text(input_ids, normalize=normalize)
        return text_features

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, CLIPOutput]:
        """Embed images and text and compute their pairwise similarity.

        Both embeddings are requested with ``normalize=True`` and the logits
        are their plain matrix product; no additional scaling is applied
        here.

        Args:
            input_ids: Token ids for the text inputs. Required.
            pixel_values: Pre-processed image tensor. Required.
            return_dict: When truthy a ``CLIPOutput`` is returned; otherwise
                (including the default ``None``) a plain tuple is returned.
            **kwargs: Extra keyword arguments are accepted and ignored, the
                same way ``get_image_features`` / ``get_text_features`` do.
                This lets a processor output that carries additional keys
                (presumably ``attention_mask`` -- confirm against the
                processor in use) be splatted directly into the call.

        Returns:
            ``(logits_per_image, logits_per_text, text_embeds, image_embeds)``
            as a tuple, or the same values wrapped in ``CLIPOutput``.

        Raises:
            ValueError: If ``pixel_values`` or ``input_ids`` is ``None``.
        """
        # Fail early with a clear message instead of an opaque error raised
        # from inside the encoders when an input is missing.
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        vision_outputs = self.get_image_features(pixel_values=pixel_values, normalize=True)
        text_outputs = self.get_text_features(input_ids=input_ids, normalize=True)

        # Rows index texts, columns index images; the transpose gives the
        # image-major view.
        logits_per_text = text_outputs @ vision_outputs.T
        logits_per_image = logits_per_text.T

        if not return_dict:
            return logits_per_image, logits_per_text, text_outputs, vision_outputs

        return CLIPOutput(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_outputs,
            image_embeds=vision_outputs
        )
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9491b78dc04d0d18828075a2654c74f3c9134151f6d85ee8e8a0b022f24cc598
3
+ size 598518820
preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "crop_size": {
3
  "height": 224,
4
  "width": 224
@@ -6,7 +9,6 @@
6
  "do_center_crop": true,
7
  "do_convert_rgb": true,
8
  "do_normalize": true,
9
- "do_rescale": true,
10
  "do_resize": true,
11
  "feature_extractor_type": "CLIPFeatureExtractor",
12
  "image_mean": [
@@ -21,7 +23,6 @@
21
  0.27577711
22
  ],
23
  "resample": 3,
24
- "rescale_factor": 0.00392156862745098,
25
  "size": {
26
  "shortest_edge": 224
27
  }
 
1
  {
2
+ "auto_map": {
3
+ "AutoProcessor": "marqo_fashionCLIP.MarqoFashionImageProcessor"
4
+ },
5
  "crop_size": {
6
  "height": 224,
7
  "width": 224
 
9
  "do_center_crop": true,
10
  "do_convert_rgb": true,
11
  "do_normalize": true,
 
12
  "do_resize": true,
13
  "feature_extractor_type": "CLIPFeatureExtractor",
14
  "image_mean": [
 
23
  0.27577711
24
  ],
25
  "resample": 3,
 
26
  "size": {
27
  "shortest_edge": 224
28
  }
special_tokens_map.json CHANGED
@@ -14,7 +14,7 @@
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "<|endoftext|>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
 
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "!",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -24,7 +24,7 @@
24
  "eos_token": "<|endoftext|>",
25
  "errors": "replace",
26
  "model_max_length": 77,
27
- "pad_token": "<|endoftext|>",
28
  "tokenizer_class": "CLIPTokenizer",
29
  "unk_token": "<|endoftext|>"
30
  }
 
24
  "eos_token": "<|endoftext|>",
25
  "errors": "replace",
26
  "model_max_length": 77,
27
+ "pad_token": "!",
28
  "tokenizer_class": "CLIPTokenizer",
29
  "unk_token": "<|endoftext|>"
30
  }