lucasjin committed
Commit
fc93203
1 Parent(s): 535ebdc

Upload folder using huggingface_hub

chinese-clip-vit-large-patch14-336px/.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
chinese-clip-vit-large-patch14-336px/README.md ADDED
@@ -0,0 +1,160 @@
+ ---
+ tags:
+ - vision
+ widget:
+ - src: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/resolve/main/festival.jpg
+   candidate_labels: 灯笼, 鞭炮, 对联
+   example_title: festival
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
+   candidate_labels: 音乐表演, 体育运动
+   example_title: cat & dog
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
+   candidate_labels: 梅西, C罗, 马奎尔
+   example_title: football
+ ---
+
+
+
+ # Chinese-CLIP-ViT-Large-Patch14-336px
+
+ ## Introduction
+ This is the large version of Chinese CLIP, with ViT-L/14@336px as the image encoder and RoBERTa-wwm-base as the text encoder. Chinese CLIP is a simple implementation of CLIP trained on a large-scale dataset of around 200 million Chinese image-text pairs. For more details, please refer to our technical report https://arxiv.org/abs/2211.01335 and our official GitHub repo https://github.com/OFA-Sys/Chinese-CLIP (welcome to star it! 🔥🔥)
+
+ ## Use with the official API
+ We provide a simple code snippet below that shows how to use the Chinese-CLIP API to compute image and text embeddings and image-text similarity scores.
+
+ ```python
+ from PIL import Image
+ import requests
+ from transformers import ChineseCLIPProcessor, ChineseCLIPModel
+
+ model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")
+ processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")
+
+ url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
+ image = Image.open(requests.get(url, stream=True).raw)
+ # Squirtle, Bulbasaur, Charmander, Pikachu in English
+ texts = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]
+
+ # compute image features
+ inputs = processor(images=image, return_tensors="pt")
+ image_features = model.get_image_features(**inputs)
+ image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)  # normalize
+
+ # compute text features
+ inputs = processor(text=texts, padding=True, return_tensors="pt")
+ text_features = model.get_text_features(**inputs)
+ text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)  # normalize
+
+ # compute image-text similarity scores
+ inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
+ outputs = model(**inputs)
+ logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+ probs = logits_per_image.softmax(dim=1)  # probs: [[0.0219, 0.0316, 0.0043, 0.9423]]
+ ```
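+
+ The same scores can also be recovered directly from the normalized embeddings computed above. This is a minimal sketch, assuming `ChineseCLIPModel` exposes a learned `logit_scale` parameter in the same way `CLIPModel` does:
+
+ ```python
+ import torch
+
+ # sketch: the scaled cosine similarity of the (already L2-normalized) embeddings
+ # should closely match logits_per_image from the forward pass above
+ with torch.no_grad():
+     manual_logits = model.logit_scale.exp() * image_features @ text_features.t()
+     manual_probs = manual_logits.softmax(dim=1)  # expected to be close to `probs`
+ ```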
+
+ However, if the API alone does not cover your use case, feel free to check our GitHub repo https://github.com/OFA-Sys/Chinese-CLIP for more details on training and inference.
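+
+ For quick zero-shot classification (the task behind the widget at the top of this card), the `transformers` pipeline API can also be used. The snippet below is a minimal sketch, assuming your `transformers` version registers Chinese-CLIP for the `zero-shot-image-classification` pipeline; passing a bare `{}` hypothesis template is an assumption that plain Chinese labels work well as prompts:
+
+ ```python
+ from transformers import pipeline
+
+ classifier = pipeline(
+     "zero-shot-image-classification",
+     model="OFA-Sys/chinese-clip-vit-large-patch14-336px",
+ )
+
+ # labels taken from the widget example above: lantern, firecracker, Spring Festival couplet
+ predictions = classifier(
+     "https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/resolve/main/festival.jpg",
+     candidate_labels=["灯笼", "鞭炮", "对联"],
+     hypothesis_template="{}",  # assumption: use the raw label as the text prompt
+ )
+ print(predictions)  # list of {"score": ..., "label": ...} sorted by score
+ ```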
+ <br><br>
+
+ ## Results
+ **MUGE Text-to-Image Retrieval**:
+ <table border="1" width="100%">
+ <tr align="center">
+ <th>Setup</th><th colspan="4">Zero-shot</th><th colspan="4">Finetune</th>
+ </tr>
+ <tr align="center">
+ <td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>MR</td><td>R@1</td><td>R@5</td><td>R@10</td><td>MR</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">Wukong</td><td>42.7</td><td>69.0</td><td>78.0</td><td>63.2</td><td>52.7</td><td>77.9</td><td>85.6</td><td>72.1</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">R2D2</td><td>49.5</td><td>75.7</td><td>83.2</td><td>69.5</td><td>60.1</td><td>82.9</td><td>89.4</td><td>77.5</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">CN-CLIP</td><td>63.0</td><td>84.1</td><td>89.2</td><td>78.8</td><td>68.9</td><td>88.7</td><td>93.1</td><td>83.6</td>
+ </tr>
+ </table>
+ <br>
+
+ **Flickr30K-CN Retrieval**:
+ <table border="1" width="120%">
+ <tr align="center">
+ <th>Task</th><th colspan="6">Text-to-Image</th><th colspan="6">Image-to-Text</th>
+ </tr>
+ <tr align="center">
+ <th>Setup</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th>
+ </tr>
+ <tr align="center">
+ <td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">Wukong</td><td>51.7</td><td>78.9</td><td>86.3</td><td>77.4</td><td>94.5</td><td>97.0</td><td>76.1</td><td>94.8</td><td>97.5</td><td>92.7</td><td>99.1</td><td>99.6</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">R2D2</td><td>60.9</td><td>86.8</td><td>92.7</td><td>84.4</td><td>96.7</td><td>98.4</td><td>77.6</td><td>96.7</td><td>98.9</td><td>95.6</td><td>99.8</td><td>100.0</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">CN-CLIP</td><td>71.2</td><td>91.4</td><td>95.5</td><td>83.8</td><td>96.9</td><td>98.6</td><td>81.6</td><td>97.5</td><td>98.8</td><td>95.3</td><td>99.7</td><td>100.0</td>
+ </tr>
+ </table>
+ <br>
+
+ **COCO-CN Retrieval**:
+ <table border="1" width="100%">
+ <tr align="center">
+ <th>Task</th><th colspan="6">Text-to-Image</th><th colspan="6">Image-to-Text</th>
+ </tr>
+ <tr align="center">
+ <th>Setup</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th>
+ </tr>
+ <tr align="center">
+ <td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">Wukong</td><td>53.4</td><td>80.2</td><td>90.1</td><td>74.0</td><td>94.4</td><td>98.1</td><td>55.2</td><td>81.0</td><td>90.6</td><td>73.3</td><td>94.0</td><td>98.0</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">R2D2</td><td>56.4</td><td>85.0</td><td>93.1</td><td>79.1</td><td>96.5</td><td>98.9</td><td>63.3</td><td>89.3</td><td>95.7</td><td>79.3</td><td>97.1</td><td>98.7</td>
+ </tr>
+ <tr align="center">
+ <td width="120%">CN-CLIP</td><td>69.2</td><td>89.9</td><td>96.1</td><td>81.5</td><td>96.9</td><td>99.1</td><td>63.0</td><td>86.6</td><td>92.9</td><td>83.5</td><td>97.3</td><td>99.2</td>
+ </tr>
+ </table>
+ <br>
+
+ **Zero-shot Image Classification**:
+ <table border="1" width="100%">
+ <tr align="center">
+ <th>Task</th><th>CIFAR10</th><th>CIFAR100</th><th>DTD</th><th>EuroSAT</th><th>FER</th><th>FGVC</th><th>KITTI</th><th>MNIST</th><th>PC</th><th>VOC</th>
+ </tr>
+ <tr align="center">
+ <td width="150%">GIT</td><td>88.5</td><td>61.1</td><td>42.9</td><td>43.4</td><td>41.4</td><td>6.7</td><td>22.1</td><td>68.9</td><td>50.0</td><td>80.2</td>
+ </tr>
+ <tr align="center">
+ <td width="150%">ALIGN</td><td>94.9</td><td>76.8</td><td>66.1</td><td>52.1</td><td>50.8</td><td>25.0</td><td>41.2</td><td>74.0</td><td>55.2</td><td>83.0</td>
+ </tr>
+ <tr align="center">
+ <td width="150%">CLIP</td><td>94.9</td><td>77.0</td><td>56.0</td><td>63.0</td><td>48.3</td><td>33.3</td><td>11.5</td><td>79.0</td><td>62.3</td><td>84.0</td>
+ </tr>
+ <tr align="center">
+ <td width="150%">Wukong</td><td>95.4</td><td>77.1</td><td>40.9</td><td>50.3</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td>
+ </tr>
+ <tr align="center">
+ <td width="150%">CN-CLIP</td><td>96.0</td><td>79.7</td><td>51.2</td><td>52.0</td><td>55.1</td><td>26.2</td><td>49.9</td><td>79.4</td><td>63.5</td><td>84.9</td>
+ </tr>
+ </table>
+ <br>
+
+ ## Citation
+ If you find Chinese CLIP helpful, feel free to cite our paper. Thanks for your support!
+
+ ```bibtex
+ @article{chinese-clip,
+   title={Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese},
+   author={Yang, An and Pan, Junshu and Lin, Junyang and Men, Rui and Zhang, Yichang and Zhou, Jingren and Zhou, Chang},
+   journal={arXiv preprint arXiv:2211.01335},
+   year={2022}
+ }
+ ```
+ <br>
chinese-clip-vit-large-patch14-336px/clip_cn_vit-l-14-336.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53b9a3141f14b2e87366b0857ce78ac1c4c99e028d4d3d832ca8107deae0775b
+ size 1626446358
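The three lines above are a Git LFS pointer rather than the checkpoint itself: the actual `clip_cn_vit-l-14-336.pt` (about 1.6 GB) is stored in LFS and identified by its SHA-256. A minimal sketch for verifying a downloaded copy against the pointer (the local path below is hypothetical):

```python
import hashlib
from pathlib import Path

# hypothetical local path to the downloaded checkpoint
ckpt = Path("chinese-clip-vit-large-patch14-336px/clip_cn_vit-l-14-336.pt")

sha256 = hashlib.sha256()
with ckpt.open("rb") as f:
    # hash in 1 MiB chunks so the large file is never fully loaded into memory
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

assert ckpt.stat().st_size == 1626446358  # "size" field of the pointer
assert sha256.hexdigest() == "53b9a3141f14b2e87366b0857ce78ac1c4c99e028d4d3d832ca8107deae0775b"  # "oid" field
```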
chinese-clip-vit-large-patch14-336px/config.json ADDED
@@ -0,0 +1,125 @@
+ {
+   "architectures": [
+     "ChineseCLIPModel"
+   ],
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "chinese_clip",
+   "projection_dim": 768,
+   "text_config": {
+     "architectures": [
+       "ChineseCLIPTextModel"
+     ],
+     "attention_probs_dropout_prob": 0.1,
+     "bos_token_id": 0,
+     "directionality": "bidi",
+     "eos_token_id": 2,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "layer_norm_eps": 1e-12,
+     "max_position_embeddings": 512,
+     "model_type": "chinese_clip_text_model",
+     "num_attention_heads": 12,
+     "num_hidden_layers": 12,
+     "output_past": true,
+     "pad_token_id": 0,
+     "pooler_fc_size": 768,
+     "pooler_num_attention_heads": 12,
+     "pooler_num_fc_layers": 3,
+     "pooler_size_per_head": 128,
+     "pooler_type": "first_token_transform",
+     "type_vocab_size": 2,
+     "vocab_size": 21128
+   },
+   "text_config_dict": null,
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 336,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 768,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.21.3",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_config_dict": {
+     "hidden_size": 1024,
+     "image_size": 336,
+     "intermediate_size": 4096,
+     "num_attention_heads": 16,
+     "num_hidden_layers": 24,
+     "patch_size": 14,
+     "projection_dim": 768
+   }
+ }
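Most of the `vision_config` entries above are generic defaults serialized by `transformers`; the fields that actually characterize this checkpoint are the two tower sizes and the shared projection dimension. A minimal sketch for confirming them from the published config, assuming a `transformers` release with Chinese-CLIP support:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")

print(config.model_type)                       # chinese_clip
print(config.projection_dim)                   # 768-dim joint embedding space
print(config.text_config.hidden_size,          # 768: RoBERTa-wwm-base text tower
      config.text_config.num_hidden_layers)    # 12 layers
print(config.vision_config.hidden_size,        # 1024: ViT-L/14 vision tower
      config.vision_config.num_hidden_layers,  # 24 layers
      config.vision_config.image_size,         # 336
      config.vision_config.patch_size)         # 14
```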
chinese-clip-vit-large-patch14-336px/preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "do_center_crop": false,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "ChineseCLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": {
+     "height": 336,
+     "width": 336
+   }
+ }
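In plain terms, this preprocessor resizes the input to 336×336 with bicubic resampling (`resample: 3`), skips the center crop, rescales pixel values to [0, 1], and normalizes with the CLIP mean/std. The snippet below is a rough torchvision equivalent for illustration, not the official `ChineseCLIPProcessor` code path, and the image path is a placeholder:

```python
from PIL import Image
from torchvision import transforms

# approximate re-implementation of preprocessor_config.json with torchvision
preprocess = transforms.Compose([
    transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),  # also rescales pixel values to [0, 1]
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                         std=[0.26862954, 0.26130258, 0.27577711]),
])

# "example.jpg" is a placeholder path
pixel_values = preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)  # (1, 3, 336, 336)
```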
chinese-clip-vit-large-patch14-336px/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:973be725d497f7a01d3caebf9e79098d568889f42dfcc64832bbbbc5c963a70f
+ size 1626425823
chinese-clip-vit-large-patch14-336px/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
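`vocab.txt` holds the Chinese BERT WordPiece vocabulary used by the text tower; its size should match `vocab_size: 21128` in `config.json`. A minimal sketch of how it is consumed through the processor's tokenizer (the exact tokenization behavior noted in the comments is an assumption based on the BERT-style text config):

```python
from transformers import ChineseCLIPProcessor

processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")
tokenizer = processor.tokenizer  # BERT-style WordPiece tokenizer built from vocab.txt

print(len(tokenizer))                   # expected to match text_config.vocab_size (21128)
print(tokenizer.tokenize("皮卡丘"))      # Chinese text is typically split character by character
print(tokenizer("皮卡丘")["input_ids"])  # ids include the [CLS]/[SEP] special tokens
```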