Commit 0f983db
Parent(s): b61121e
Upload model

Files changed:
- config.json +4 -3
- modelling.py +8 -2
config.json CHANGED
@@ -1,12 +1,13 @@
 {
-  "_commit_hash": "
-  "_name_or_path": "flavour/
+  "_commit_hash": "b61121e506fb7330d5fe093287b3ab12b3c8e564",
+  "_name_or_path": "flavour/clippy-dinov2-small-jina-embedding-t-en-v1",
   "architectures": [
     "VTDEModel"
   ],
   "auto_map": {
     "AutoConfig": "modelling.VTDEConfig",
-    "
+    "AutoModel": "modelling.VTDEModel",
+    "AutoModelForZeroShotImageClassification": "flavour/clippy-dinov2-small-jina-embedding-t-en-v1--modelling.VTDEModel"
   },
   "logit_scale_init_value": 2.6592,
   "model_type": "vtde",
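
The new "auto_map" entries are what let the Auto classes resolve the custom code shipped in this repo's modelling.py. A minimal loading sketch, assuming the repo id from "_name_or_path" above and that running the repo's custom code with trust_remote_code=True is acceptable:

from transformers import AutoConfig, AutoModel

repo_id = "flavour/clippy-dinov2-small-jina-embedding-t-en-v1"  # from "_name_or_path" above

# trust_remote_code=True lets the auto_map entries resolve
# modelling.VTDEConfig / modelling.VTDEModel from the repo itself.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
print(type(model).__name__)  # expected: VTDEModel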
modelling.py CHANGED
@@ -6,8 +6,9 @@ __all__ = ['VTDEConfig', 'VTDEModel']
 # %% ../notebooks/12_modelling.ipynb 1
 from transformers.models.clip.modeling_clip import CLIPOutput, clip_loss
 from typing import Optional, Tuple, Union
-from transformers import
+from transformers import PreTrainedModel, VisionTextDualEncoderModel
 import torch
+from transformers import VisionTextDualEncoderConfig
 
 class VTDEConfig(VisionTextDualEncoderConfig):
     model_type = "vtde"
@@ -20,11 +21,16 @@ class VTDEConfig(VisionTextDualEncoderConfig):
         pooling_mode in ['mean', 'max', 'cls']
         https://arxiv.org/pdf/2210.09996.pdf
         https://github.com/kahnchana/clippy/blob/3c102c29c32f7c66c6e52e09b795fe9c061bbb03/src/open_clip/hf_model.py#L56
+        also
+        https://arxiv.org/pdf/2301.07836.pdf
         """
         self.text_pooling_mode = text_pooling_mode
         self.vision_pooling_mode = vision_pooling_mode
         super().__init__(projection_dim, logit_scale_init_value, **kwargs)
 
+VTDEConfig.register_for_auto_class()
+
+
 class VTDEModel(VisionTextDualEncoderModel):
     config_class = VTDEConfig
     base_model_prefix = "vtde"
@@ -170,5 +176,5 @@ class VTDEModel(VisionTextDualEncoderModel):
             vision_model_output=image_embeds,
         )
 
-
+
 VTDEModel.register_for_auto_class("AutoModel")
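
For context on the new registration calls: VTDEConfig.register_for_auto_class() and VTDEModel.register_for_auto_class("AutoModel") are what make save_pretrained()/push_to_hub() copy modelling.py into the repo and write the auto_map entries shown in the config.json diff above. The sketch below is illustrative only: the full VTDEConfig.__init__ signature is not visible in this hunk, so the keyword names follow the docstring, and the two tower checkpoints (facebook/dinov2-small, jinaai/jina-embedding-t-en-v1) are assumptions inferred from the model name rather than anything stated in this commit.

from transformers import AutoConfig
from modelling import VTDEConfig, VTDEModel  # this repo's modelling.py

# Assumed tower configs; the actual checkpoints are not recorded in this diff.
vision_cfg = AutoConfig.from_pretrained("facebook/dinov2-small")
text_cfg = AutoConfig.from_pretrained("jinaai/jina-embedding-t-en-v1")

# Keyword names follow the docstring; pooling_mode in ['mean', 'max', 'cls'].
config = VTDEConfig(
    vision_config=vision_cfg.to_dict(),
    text_config=text_cfg.to_dict(),
    logit_scale_init_value=2.6592,
    text_pooling_mode="mean",
    vision_pooling_mode="cls",
)
model = VTDEModel(config)  # towers are built from the configs (randomly initialised)

# Because both classes are registered for auto classes in modelling.py,
# saving (or push_to_hub) copies modelling.py next to config.json and
# writes the "AutoConfig"/"AutoModel" auto_map entries into it.
model.save_pretrained("vtde-checkpoint")  # output directory name is illustrative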