Add support for AutoModel
- README.md +28 -4
- config.json +7 -1
- marqo_fashionCLIP.py +70 -0
- model.safetensors +3 -0
- preprocessor_config.json +3 -2
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
README.md CHANGED

@@ -29,6 +29,32 @@ The model was fine-tuned from ViT-B-16 (laion2b_s34b_b88k).
 
 ## Usage
 
+### Hugging Face
+
+The model can be loaded with AutoModel by
+
+```python
+from transformers import AutoModel, AutoProcessor
+model = AutoModel.from_pretrained('Marqo/marqo-fashionCLIP', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('Marqo/marqo-fashionCLIP', trust_remote_code=True)
+
+import torch
+from PIL import Image
+
+image = [Image.open("docs/fashion-hippo.png")]
+text = ["a hat", "a t-shirt", "shoes"]
+processed = processor(text=text, images=image, padding='max_length', return_tensors="pt")
+
+with torch.no_grad():
+    image_features = model.get_image_features(processed['pixel_values'], normalize=True)
+    text_features = model.get_text_features(processed['input_ids'], normalize=True)
+
+text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+print("Label probs:", text_probs)
+# [0.99990773, 0.00006382, 0.00002847]
+```
+
 ### OpenCLIP
 
 The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by

@@ -45,10 +71,8 @@ image = preprocess_val(Image.open("docs/fashion-hippo.png")).unsqueeze(0)
 text = tokenizer(["a hat", "a t-shirt", "shoes"])
 
 with torch.no_grad(), torch.cuda.amp.autocast():
-    image_features = model.encode_image(image)
-    text_features = model.encode_text(text)
-    image_features /= image_features.norm(dim=-1, keepdim=True)
-    text_features /= text_features.norm(dim=-1, keepdim=True)
+    image_features = model.encode_image(image, normalize=True)
+    text_features = model.encode_text(text, normalize=True)
 
 text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
 
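The second hunk swaps the manual L2 normalization in the OpenCLIP snippet for open_clip's `normalize=True` flag. A quick sketch of why the two are equivalent (the tensor shape is an arbitrary assumption, not taken from the model):

```python
import torch
import torch.nn.functional as F

# open_clip's encode_image/encode_text with normalize=True L2-normalize the output
# along the feature dimension, which is what the removed README lines did by hand.
features = torch.randn(2, 512)                            # assumed (batch, dim) embeddings
manual = features / features.norm(dim=-1, keepdim=True)   # the removed lines
builtin = F.normalize(features, dim=-1)                    # what normalize=True applies
print(torch.allclose(manual, builtin))                     # True, up to floating-point tolerance
```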
config.json CHANGED

@@ -1,3 +1,9 @@
 {
+  "auto_map": {
+    "AutoConfig": "marqo_fashionCLIP.MarqoFashionCLIPConfig",
+    "AutoModel": "marqo_fashionCLIP.MarqoFashionCLIP",
+    "AutoProcessor": "marqo_fashionCLIP.CLIPProcessor"
+  },
+  "open_clip_model_name": "hf-hub:Marqo/marqo-fashionCLIP",
   "model_type": "clip"
-}
+}
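The `auto_map` block is what lets the stock Auto classes resolve to the custom code shipped in marqo_fashionCLIP.py once `trust_remote_code=True` is passed. A minimal sketch of that resolution, assuming the published repo id `Marqo/marqo-fashionCLIP`:

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code=True allows transformers to import marqo_fashionCLIP.py from the repo
# and instantiate the classes named in auto_map instead of the built-in CLIP classes.
config = AutoConfig.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)
print(type(config).__name__)        # expected: MarqoFashionCLIPConfig
print(config.open_clip_model_name)  # expected: hf-hub:Marqo/marqo-fashionCLIP

model = AutoModel.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)
print(type(model).__name__)         # expected: MarqoFashionCLIP
```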
marqo_fashionCLIP.py ADDED

@@ -0,0 +1,70 @@
+import torch
+from open_clip import create_model
+from transformers import PretrainedConfig, PreTrainedModel, CLIPProcessor
+from transformers.models.clip.modeling_clip import CLIPOutput
+from typing import Optional, Tuple, Union
+
+class MarqoFashionCLIPConfig(PretrainedConfig):
+    def __init__(
+        self,
+        open_clip_model_name: str = "",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.open_clip_model_name = open_clip_model_name
+
+
+class MarqoFashionCLIP(PreTrainedModel):
+    config_class = MarqoFashionCLIPConfig
+
+    def __init__(self, config: MarqoFashionCLIPConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = create_model(config.open_clip_model_name, output_dict=True)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        normalize: bool = False,
+        **kwargs
+    ) -> torch.FloatTensor:
+
+        with torch.inference_mode():
+            image_features = self.model.encode_image(pixel_values, normalize=normalize)
+        return image_features
+
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor,
+        normalize: bool = False,
+        **kwargs
+    ) -> torch.FloatTensor:
+
+        with torch.inference_mode():
+            text_features = self.model.encode_text(input_ids, normalize=normalize)
+        return text_features
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CLIPOutput]:
+
+        vision_outputs = self.get_image_features(pixel_values=pixel_values, normalize=True)
+        text_outputs = self.get_text_features(input_ids=input_ids, normalize=True)
+
+        logits_per_text = text_outputs @ vision_outputs.T
+        logits_per_image = logits_per_text.T
+
+        if not return_dict:
+            return logits_per_image, logits_per_text, text_outputs, vision_outputs
+
+        return CLIPOutput(
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_outputs,
+            image_embeds=vision_outputs
+        )
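Besides get_image_features/get_text_features used in the README, the wrapper's forward() normalizes both embeddings and returns a CLIPOutput; note that, unlike the README snippet, it applies no logit scale before building the similarity matrix. A hedged usage sketch mirroring the README's inputs (the image path is the README's and is assumed to exist locally):

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("Marqo/marqo-fashionCLIP", trust_remote_code=True)

batch = processor(
    text=["a hat", "a t-shirt", "shoes"],
    images=[Image.open("docs/fashion-hippo.png")],
    padding="max_length",
    return_tensors="pt",
)

with torch.no_grad():
    out = model(
        input_ids=batch["input_ids"],
        pixel_values=batch["pixel_values"],
        return_dict=True,
    )

# logits_per_image is image_embeds @ text_embeds.T on unit-norm embeddings, i.e. raw
# cosine similarities; scale them (e.g. by 100, as in the README) before the softmax
# if you want similarly peaked probabilities.
probs = (100.0 * out.logits_per_image).softmax(dim=-1)
print(probs)
```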
model.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9491b78dc04d0d18828075a2654c74f3c9134151f6d85ee8e8a0b022f24cc598
+size 598518820
preprocessor_config.json CHANGED

@@ -1,4 +1,7 @@
 {
+  "auto_map": {
+    "AutoProcessor": "marqo_fashionCLIP.MarqoFashionImageProcessor"
+  },
   "crop_size": {
     "height": 224,
     "width": 224
@@ -6,7 +9,6 @@
   "do_center_crop": true,
   "do_convert_rgb": true,
   "do_normalize": true,
-  "do_rescale": true,
   "do_resize": true,
   "feature_extractor_type": "CLIPFeatureExtractor",
   "image_mean": [
@@ -21,7 +23,6 @@
     0.27577711
   ],
   "resample": 3,
-  "rescale_factor": 0.00392156862745098,
   "size": {
     "shortest_edge": 224
   }
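These values describe the usual CLIP-style pipeline: resize the shortest edge to 224 with bicubic resampling (`"resample": 3`), center-crop to 224x224, convert to RGB, and normalize with the listed mean/std. With the stock CLIPImageProcessor, removing `do_rescale`/`rescale_factor` just falls back to its defaults (rescale by 1/255); the behavior of the custom MarqoFashionImageProcessor named in `auto_map` is not shown in this commit. As a rough illustration, using the stock class as a stand-in:

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Stand-in illustration only: auto_map points AutoProcessor at a custom
# MarqoFashionImageProcessor whose implementation is not part of this diff.
image_processor = CLIPImageProcessor.from_pretrained("Marqo/marqo-fashionCLIP")
print(image_processor.size, image_processor.crop_size)
# expected: {'shortest_edge': 224} {'height': 224, 'width': 224}

pixel_values = image_processor(
    images=Image.open("docs/fashion-hippo.png"), return_tensors="pt"
)["pixel_values"]
print(pixel_values.shape)  # expected: torch.Size([1, 3, 224, 224])
```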
special_tokens_map.json CHANGED

@@ -14,7 +14,7 @@
     "single_word": false
   },
   "pad_token": {
-    "content": "
+    "content": "!",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer_config.json CHANGED

@@ -24,7 +24,7 @@
   "eos_token": "<|endoftext|>",
   "errors": "replace",
   "model_max_length": 77,
-  "pad_token": "
+  "pad_token": "!",
   "tokenizer_class": "CLIPTokenizer",
   "unk_token": "<|endoftext|>"
 }
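Together with special_tokens_map.json above, this switches the pad token to "!", presumably so that Hugging Face padding matches the zero-padded sequences open_clip's encode_text expects ("!" maps to id 0 in the CLIP BPE vocabulary); `model_max_length: 77` is also why the README example passes `padding='max_length'`. A small sketch, assuming the published repo id:

```python
from transformers import AutoTokenizer

# The tokenizer half of the processor: pads every caption to CLIP's 77-token context.
tokenizer = AutoTokenizer.from_pretrained("Marqo/marqo-fashionCLIP")
print(tokenizer.pad_token)  # expected: "!"

enc = tokenizer(["a hat", "a t-shirt", "shoes"], padding="max_length", return_tensors="pt")
print(enc["input_ids"].shape)  # expected: torch.Size([3, 77])
```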