Commit bb1ff9e (verified) by bwang0911 · Parent(s): de829f3

Upload 9 files
README.md ADDED
@@ -0,0 +1,197 @@
1
+ ---
2
+ tags:
3
+ - feature-extraction
4
+ - sentence-similarity
5
+ - mteb
6
+ - clip
7
+ - vision
8
+ - transformers.js
9
+ language: en
10
+ inference: false
11
+ license: apache-2.0
12
+ library_name: transformers
13
+ ---
14
+
15
+ <br><br>
16
+
17
+ <p align="center">
18
+ <img src="https://aeiljuispo.cloudimg.io/v7/https://cdn-uploads.huggingface.co/production/uploads/603763514de52ff951d89793/AFoybzd5lpBQXEBrQHuTt.png?w=200&h=200&f=face" alt="Finetuner logo: Finetuner helps you to create experiments in order to improve embeddings on search tasks. It accompanies you to deliver the last mile of performance-tuning for neural search applications." width="150px">
19
+ </p>
20
+
21
+
22
+ <p align="center">
23
+ <b>The embedding set trained by <a href="https://jina.ai/"><b>Jina AI</b></a>.</b>
24
+ </p>
25
+
26
+ <p align="center">
27
+ <b>Jina CLIP: your CLIP model is also your text retriever!</b>
28
+ </p>
29
+
30
+
31
+ ## Intended Usage & Model Info
32
+
33
+ `jina-clip-v1` is a state-of-the-art English **multimodal (text-image) embedding model**.
34
+
35
+ Traditional text embedding models, such as [jina-embeddings-v2-base-en](https://huggingface.co/jinaai/jina-embeddings-v2-base-en), excel in text-to-text retrieval but are incapable of cross-modal tasks. Models like [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) effectively align image and text embeddings but are not optimized for text-to-text retrieval due to their training methodologies and context limitations.
36
+
37
+ `jina-clip-v1` bridges this gap by offering robust performance in both domains.
38
+ Its text component matches the retrieval efficiency of `jina-embeddings-v2-base-en`, while its overall architecture sets a new benchmark for cross-modal retrieval.
39
+ This dual capability makes it an excellent tool for multimodal retrieval-augmented generation (MuRAG) applications, enabling seamless text-to-text and text-to-image searches within a single model.
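+ 
+ For instance, because both encoders embed into the same space, a single text query can be ranked against a mixed corpus of documents and images. The snippet below is a minimal sketch built on the `encode_text`/`encode_image` calls shown in the Usage section; the corpus contents are illustrative placeholders, and it assumes the returned embeddings behave like NumPy arrays.
+ 
+ ```python
+ import numpy as np
+ from transformers import AutoModel
+ 
+ model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
+ 
+ query_emb = model.encode_text(['a cat playing with yarn'])[0]
+ 
+ # Mixed corpus: plain text documents and an image URL (placeholders)
+ doc_embs = model.encode_text(['How to groom a long-haired cat', 'A guide to dog training'])
+ img_embs = model.encode_image(['https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg'])
+ 
+ # Rank every item, regardless of modality, by cosine similarity to the query
+ corpus = np.vstack([doc_embs, img_embs])
+ scores = corpus @ query_emb / (np.linalg.norm(corpus, axis=1) * np.linalg.norm(query_emb))
+ print(np.argsort(-scores))  # indices of corpus items, best match first
+ ```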
40
+
41
+
42
+ ## Data & Parameters
43
+
44
+ [Check out our paper](https://arxiv.org/abs/2405.20204)
45
+
46
+ ## Usage
47
+
48
+ 1. The easiest way to start using `jina-clip-v1` is through Jina AI's [Embeddings API](https://jina.ai/embeddings/), as sketched below.
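+ 
+ A minimal sketch of calling the API over HTTPS follows; the endpoint path, payload schema, and `JINA_API_KEY` environment variable are assumptions here, so check the Embeddings API documentation for the exact request format and authentication.
+ 
+ ```python
+ import os
+ import requests
+ 
+ # Assumed endpoint and payload shape; verify against the Embeddings API docs
+ response = requests.post(
+     'https://api.jina.ai/v1/embeddings',
+     headers={'Authorization': f"Bearer {os.environ['JINA_API_KEY']}"},
+     json={'model': 'jina-clip-v1', 'input': ['A blue cat', 'A red cat']},
+ )
+ embeddings = [item['embedding'] for item in response.json()['data']]
+ ```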
49
+ 2. Alternatively, you can use Jina CLIP directly via the `transformers` package.
50
+
51
+ ```python
52
+ # Requires: pip install transformers einops timm pillow
53
+ from transformers import AutoModel
54
+
55
+ # Initialize the model
56
+ model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
57
+
58
+ # Example sentences to encode
59
+ sentences = ['A blue cat', 'A red cat']
60
+
61
+ # Public image URLs
62
+ image_urls = [
63
+ 'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
64
+ 'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
65
+ ]
66
+
67
+ # Encode text and images
68
+ text_embeddings = model.encode_text(sentences)
69
+ image_embeddings = model.encode_image(image_urls) # also accepts PIL.Image objects, local file paths, and data URIs
70
+
71
+ # Compute similarities
72
+ print(text_embeddings[0] @ text_embeddings[1].T) # text embedding similarity
73
+ print(text_embeddings[0] @ image_embeddings[0].T) # text-image cross-modal similarity
74
+ print(text_embeddings[0] @ image_embeddings[1].T) # text-image cross-modal similarity
75
+ print(text_embeddings[1] @ image_embeddings[0].T) # text-image cross-modal similarity
76
+ print(text_embeddings[1] @ image_embeddings[1].T) # text-image cross-modal similarity
77
+ ```
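+ 
+ The dot products above equal cosine similarity only if the embeddings are L2-normalized. If you are unsure whether the returned vectors are normalized, you can compute cosine similarity explicitly; a minimal sketch assuming the embeddings behave like NumPy arrays:
+ 
+ ```python
+ import numpy as np
+ 
+ def cos_sim(a, b):
+     # Cosine similarity between two 1-D embedding vectors
+     return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+ 
+ print(cos_sim(text_embeddings[0], image_embeddings[0])) # text-image similarity, scale-invariant
+ ```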
78
+
79
+ 3. JavaScript developers can use Jina CLIP via the [Transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install Transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
80
+
81
+ ```js
82
+ import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
83
+
84
+ // Load tokenizer and text model
85
+ const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v1');
86
+ const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
87
+
88
+ // Load processor and vision model
89
+ const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
90
+ const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
91
+
92
+ // Run tokenization
93
+ const texts = ['A blue cat', 'A red cat'];
94
+ const text_inputs = tokenizer(texts, { padding: true, truncation: true });
95
+
96
+ // Compute text embeddings
97
+ const { text_embeds } = await text_model(text_inputs);
98
+
99
+ // Read images and run processor
100
+ const urls = [
101
+ 'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
102
+ 'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
103
+ ];
104
+ const image = await Promise.all(urls.map(url => RawImage.read(url)));
105
+ const image_inputs = await processor(image);
106
+
107
+ // Compute vision embeddings
108
+ const { image_embeds } = await vision_model(image_inputs);
109
+
110
+ // Compute similarities
111
+ console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
112
+ console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
113
+ console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
114
+ console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
115
+ console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
116
+ ```
117
+
118
+ ## Performance
119
+
120
+ ### Text-Image Retrieval
121
+
122
+ | Name | Flickr Image Retr. R@1 | Flickr Image Retr. R@5 | Flickr Text Retr. R@1 | Flickr Text Retr. R@5 |
123
+ |------------------|-------------------------|-------------------------|-----------------------|-----------------------|
124
+ | ViT-B-32 | 0.597 | 0.8398 | 0.781 | 0.938 |
125
+ | ViT-B-16 | 0.6216 | 0.8572 | 0.822 | 0.966 |
126
+ | jina-clip | 0.6748 | 0.8902 | 0.811 | 0.965 |
127
+
128
+
129
+ | Name | MSCOCO Image Retr. R@1 | MSCOCO Image Retr. R@5 | MSCOCO Text Retr. R@1 | MSCOCO Text Retr. R@5 |
130
+ |------------------|-------------------------|-------------------------|-----------------------|-----------------------|
131
+ | ViT-B-32 | 0.342 | 0.6001 | 0.5234 | 0.7634 |
132
+ | ViT-B-16 | 0.3309 | 0.5842 | 0.5242 | 0.767 |
133
+ | jina-clip | 0.4111 | 0.6644 | 0.5544 | 0.7904 |
134
+
135
+ ### Text-Text Retrieval
136
+
137
+ | Name | STS12 | STS15 | STS17 | STS13 | STS14 | STS16 | STS22 | STSBenchmark | SummEval |
138
+ |-----------------------|--------|--------|--------|--------|--------|--------|--------|--------------|----------|
139
+ | jina-embeddings-v2 | 0.7427 | 0.8755 | 0.8888 | 0.833 | 0.7917 | 0.836 | 0.6346 | 0.8404 | 0.3056 |
140
+ | jina-clip | 0.7352 | 0.8746 | 0.8976 | 0.8323 | 0.7868 | 0.8377 | 0.6583 | 0.8493 | 0.3048 |
141
+
142
+
143
+ | Name | ArguAna | FiQA2018 | NFCorpus | Quora | SCIDOCS | SciFact | TRECCOVID |
144
+ |--------------------|---------|----------|----------|-------|---------|---------|-----------|
145
+ | jina-embeddings-v2 | 0.4418 | 0.4158 | 0.3245 | 0.882 | 0.1986 | 0.6668 | 0.6591 |
146
+ | jina-clip | 0.4933 | 0.3827 | 0.3352 | 0.8789| 0.2024 | 0.6734 | 0.7161 |
147
+
148
+ ## Contact
149
+
150
+ Join our [Discord community](https://discord.jina.ai) and chat with other community members about ideas.
151
+
152
+ ## Citation
153
+
154
+ If you find `jina-clip-v1` useful in your research, please cite the following paper:
155
+
156
+ ```bibtex
157
+ @misc{2405.20204,
158
+ Author = {Andreas Koukounas and Georgios Mastrapas and Michael Günther and Bo Wang and Scott Martens and Isabelle Mohr and Saba Sturua and Mohammad Kalim Akram and Joan Fontanals Martínez and Saahil Ognawala and Susana Guzman and Maximilian Werk and Nan Wang and Han Xiao},
159
+ Title = {Jina CLIP: Your CLIP Model Is Also Your Text Retriever},
160
+ Year = {2024},
161
+ Eprint = {arXiv:2405.20204},
162
+ }
163
+ ```
164
+
165
+ ## FAQ
166
+
167
+ ### I encountered this error, what should I do?
168
+
169
+ ```
170
+ ValueError: The model class you are passing has a `config_class` attribute that is not consistent with the config class you passed (model has <class 'transformers_modules.jinaai.jina-clip-implementation.7f069e2d54d609ef1ad2eb578c7bf07b5a51de41.configuration_clip.JinaCLIPConfig'> and you passed <class 'transformers_modules.jinaai.jina-clip-implementation.7f069e2d54d609ef1ad2eb578c7bf07b5a51de41.configuration_cli.JinaCLIPConfig'>. Fix one of those so they match!
171
+ ```
172
+
173
+ There was a bug in the Transformers library affecting versions 4.40.x through 4.41.1. To resolve it, upgrade `transformers` to >=4.41.2 or downgrade to <=4.40.0.
174
+
175
+ ### Given one query, how can I merge its text-text and text-image cosine similarity?
176
+
177
+ Our empirical study shows that text-text cosine similarity is normally higher than text-image cosine similarity.
178
+ If you want to merge the two scores, we recommend two approaches (a combined sketch follows the pseudocode below):
179
+
180
+ 1. Weighted average of the text-text and text-image similarities:
181
+
182
+ ```python
183
+ combined_scores = sim(text, text) + weight * sim(text, image) # pseudocode; the optimal weight depends on your dataset, but weight=2 is generally a good choice
184
+ ```
185
+
186
+ 2. Apply z-score normalization to each score distribution before merging:
187
+
188
+ ```python
189
+ # pseudo code
190
+ text_text_mean = np.mean(cos_sim_text_texts)
191
+ text_text_std = np.std(cos_sim_text_texts)
192
+ text_image_mean = np.mean(cos_sim_text_images)
193
+ text_image_std = np.std(cos_sim_text_images)
194
+
195
+ text_text_sim_normalized = (cos_sim_text_texts - text_text_mean) / text_text_std
196
+ text_image_sim_normalized = (cos_sim_text_images - text_image_mean) / text_image_std
197
+ ```
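+ 
+ One way to combine the two ideas, assuming each candidate has both a text-text and a text-image score for the query: z-normalize each score array, then take the weighted sum from approach 1. The values and the `weight` below are illustrative placeholders.
+ 
+ ```python
+ import numpy as np
+ 
+ # Per-candidate scores for one query (placeholders)
+ cos_sim_text_texts = np.array([0.82, 0.75, 0.68])
+ cos_sim_text_images = np.array([0.31, 0.42, 0.25])
+ 
+ text_text_norm = (cos_sim_text_texts - cos_sim_text_texts.mean()) / cos_sim_text_texts.std()
+ text_image_norm = (cos_sim_text_images - cos_sim_text_images.mean()) / cos_sim_text_images.std()
+ 
+ weight = 2  # illustrative; tune on your own data
+ combined_scores = text_text_norm + weight * text_image_norm
+ ranking = np.argsort(-combined_scores)  # candidate indices, best first
+ ```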
config.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "_name_or_path": "jinaai/jina-clip-v1",
3
+ "add_projections": false,
4
+ "architectures": [
5
+ "JinaCLIPModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "jinaai/jina-clip-implementation--configuration_clip.JinaCLIPConfig",
9
+ "AutoModel": "jinaai/jina-clip-implementation--modeling_clip.JinaCLIPModel"
10
+ },
11
+ "initializer_factor": 1.0,
12
+ "logit_scale_init_value": 2.6592,
13
+ "model_type": "jina_clip",
14
+ "projection_dim": 768,
15
+ "text_config": {
16
+ "_name_or_path": "",
17
+ "embed_dim": 768,
18
+ "hf_model_config_kwargs": {
19
+ "use_flash_attn": false
20
+ },
21
+ "hf_model_name_or_path": "jinaai/jina-bert-flash-implementation",
22
+ "model_type": "jina_clip_text",
23
+ "output_attentions": false,
24
+ "output_hidden_states": false,
25
+ "output_scores": false,
26
+ "pad_token_id": null,
27
+ "pooler_type": "mean_pooler",
28
+ "proj_bias": false,
29
+ "proj_type": null,
30
+ "transformers_version": "4.36.2",
31
+ "use_bfloat16": false
32
+ },
33
+ "torch_dtype": "float32",
34
+ "transformers_version": null,
35
+ "vision_config": {
36
+ "_name_or_path": "",
37
+ "embed_dim": 768,
38
+ "fused_layer_norm": false,
39
+ "head_width": 64,
40
+ "image_size": 224,
41
+ "intp_freq": false,
42
+ "layers": 12,
43
+ "ls_init_value": null,
44
+ "mlp_ratio": 2.6667,
45
+ "model_type": "jina_clip_vision",
46
+ "naive_swiglu": true,
47
+ "output_attentions": false,
48
+ "output_hidden_states": false,
49
+ "output_scores": false,
50
+ "pad_token_id": null,
51
+ "patch_dropout": 0.1,
52
+ "patch_size": 16,
53
+ "post_norm": false,
54
+ "prefix": null,
55
+ "problem_type": null,
56
+ "proj_type": null,
57
+ "pruned_heads": {},
58
+ "pt_hw_seq_len": 14,
59
+ "qkv_bias": true,
60
+ "remove_invalid_values": false,
61
+ "return_dict": true,
62
+ "return_dict_in_generate": false,
63
+ "rope_embeddings": true,
64
+ "subln": true,
65
+ "tie_word_embeddings": true,
66
+ "transformers_version": "4.36.2",
67
+ "use_bfloat16": false,
68
+ "width": 768,
69
+ "x_attention": false
70
+ }
71
+ }
gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5140d6df851d217296a10b3961ece2850d22b35bff37af948f2d9db33ae4aec2
3
+ size 890733860
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPImageProcessor",
4
+ "AutoProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPProcessor"
5
+ },
6
+ "fill_color": 0,
7
+ "image_processor_type": "JinaCLIPImageProcessor",
8
+ "interpolation": "bicubic",
9
+ "mean": [
10
+ 0.48145466,
11
+ 0.4578275,
12
+ 0.40821073
13
+ ],
14
+ "processor_class": "JinaCLIPProcessor",
15
+ "resize_mode": "shortest",
16
+ "size": 224,
17
+ "std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ]
22
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 8192,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff