Upload folder using huggingface_hub
- chinese-clip-vit-large-patch14-336px/.gitattributes +34 -0
- chinese-clip-vit-large-patch14-336px/README.md +160 -0
- chinese-clip-vit-large-patch14-336px/clip_cn_vit-l-14-336.pt +3 -0
- chinese-clip-vit-large-patch14-336px/config.json +125 -0
- chinese-clip-vit-large-patch14-336px/preprocessor_config.json +21 -0
- chinese-clip-vit-large-patch14-336px/pytorch_model.bin +3 -0
- chinese-clip-vit-large-patch14-336px/vocab.txt +0 -0
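
Per the commit title, the folder was pushed in a single commit with `huggingface_hub`. A minimal sketch of such an upload follows; the local folder path, `path_in_repo`, and target repo id are illustrative assumptions, not values read from this commit:

```python
from huggingface_hub import upload_folder

# Pushes every file in the local folder as one commit; binaries such as *.pt and
# *.bin are stored through Git LFS according to the .gitattributes rules below.
upload_folder(
    folder_path="./chinese-clip-vit-large-patch14-336px",   # local checkout (assumed)
    path_in_repo="chinese-clip-vit-large-patch14-336px",    # keeps the folder prefix seen in this diff
    repo_id="your-username/your-model-repo",                # target repo (assumed)
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```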
chinese-clip-vit-large-patch14-336px/.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
chinese-clip-vit-large-patch14-336px/README.md
ADDED
@@ -0,0 +1,160 @@
---
tags:
- vision
widget:
- src: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/resolve/main/festival.jpg
  candidate_labels: 灯笼, 鞭炮, 对联
  example_title: festival
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
  candidate_labels: 音乐表演, 体育运动
  example_title: cat & dog
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
  candidate_labels: 梅西, C罗, 马奎尔
  example_title: football
---

# Chinese-CLIP-ViT-Large-Patch14-336px

## Introduction

This is the large version of Chinese-CLIP, with ViT-L/14@336px as the image encoder and RoBERTa-wwm-base as the text encoder. Chinese-CLIP is a simple implementation of CLIP on a large-scale dataset of around 200 million Chinese image-text pairs. For more details, please refer to our technical report https://arxiv.org/abs/2211.01335 and our official GitHub repo https://github.com/OFA-Sys/Chinese-CLIP (welcome to star! 🔥🔥).

## Use with the official API

The code snippet below shows how to use the Chinese-CLIP API to compute image and text embeddings and their similarities.

```python
from PIL import Image
import requests
from transformers import ChineseCLIPProcessor, ChineseCLIPModel

model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")
processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")

url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
# Squirtle, Bulbasaur, Charmander, Pikachu in English
texts = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]

# compute image features
inputs = processor(images=image, return_tensors="pt")
image_features = model.get_image_features(**inputs)
image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)  # normalize

# compute text features
inputs = processor(text=texts, padding=True, return_tensors="pt")
text_features = model.get_text_features(**inputs)
text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)  # normalize

# compute image-text similarity scores
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # probs: [[0.0219, 0.0316, 0.0043, 0.9423]]
```

If you need more than this off-the-shelf API, please check our GitHub repo https://github.com/OFA-Sys/Chinese-CLIP for more details about training and inference.
<br><br>

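The widget examples in the metadata header above correspond to zero-shot image classification. Below is a minimal sketch of the same usage through the `transformers` pipeline API; this is not part of the original card, and both pipeline support for Chinese-CLIP and the Chinese `hypothesis_template` shown are assumptions to verify against your installed `transformers` version.

```python
from transformers import pipeline

# Zero-shot classification in the style of the widget examples above
# (assumes the installed transformers version routes Chinese-CLIP through
# the zero-shot-image-classification pipeline).
classifier = pipeline(
    "zero-shot-image-classification",
    model="OFA-Sys/chinese-clip-vit-large-patch14-336px",
)

image_url = "https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/resolve/main/festival.jpg"
predictions = classifier(
    image_url,
    candidate_labels=["灯笼", "鞭炮", "对联"],  # lantern, firecrackers, Spring Festival couplets
    hypothesis_template="一张{}的照片。",        # "a photo of {}." (illustrative choice)
)
print(predictions)  # list of {"score": ..., "label": ...}, highest score first
```
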
## Results
**MUGE Text-to-Image Retrieval**:
<table border="1" width="100%">
<tr align="center">
<th>Setup</th><th colspan="4">Zero-shot</th><th colspan="4">Finetune</th>
</tr>
<tr align="center">
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>MR</td><td>R@1</td><td>R@5</td><td>R@10</td><td>MR</td>
</tr>
<tr align="center">
<td width="120%">Wukong</td><td>42.7</td><td>69.0</td><td>78.0</td><td>63.2</td><td>52.7</td><td>77.9</td><td>85.6</td><td>72.1</td>
</tr>
<tr align="center">
<td width="120%">R2D2</td><td>49.5</td><td>75.7</td><td>83.2</td><td>69.5</td><td>60.1</td><td>82.9</td><td>89.4</td><td>77.5</td>
</tr>
<tr align="center">
<td width="120%">CN-CLIP</td><td>63.0</td><td>84.1</td><td>89.2</td><td>78.8</td><td>68.9</td><td>88.7</td><td>93.1</td><td>83.6</td>
</tr>
</table>
<br>

**Flickr30K-CN Retrieval**:
<table border="1" width="100%">
<tr align="center">
<th>Task</th><th colspan="6">Text-to-Image</th><th colspan="6">Image-to-Text</th>
</tr>
<tr align="center">
<th>Setup</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th>
</tr>
<tr align="center">
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
</tr>
<tr align="center">
<td width="120%">Wukong</td><td>51.7</td><td>78.9</td><td>86.3</td><td>77.4</td><td>94.5</td><td>97.0</td><td>76.1</td><td>94.8</td><td>97.5</td><td>92.7</td><td>99.1</td><td>99.6</td>
</tr>
<tr align="center">
<td width="120%">R2D2</td><td>60.9</td><td>86.8</td><td>92.7</td><td>84.4</td><td>96.7</td><td>98.4</td><td>77.6</td><td>96.7</td><td>98.9</td><td>95.6</td><td>99.8</td><td>100.0</td>
</tr>
<tr align="center">
<td width="120%">CN-CLIP</td><td>71.2</td><td>91.4</td><td>95.5</td><td>83.8</td><td>96.9</td><td>98.6</td><td>81.6</td><td>97.5</td><td>98.8</td><td>95.3</td><td>99.7</td><td>100.0</td>
</tr>
</table>
<br>

**COCO-CN Retrieval**:
<table border="1" width="100%">
<tr align="center">
<th>Task</th><th colspan="6">Text-to-Image</th><th colspan="6">Image-to-Text</th>
</tr>
<tr align="center">
<th>Setup</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th><th colspan="3">Zero-shot</th><th colspan="3">Finetune</th>
</tr>
<tr align="center">
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
</tr>
<tr align="center">
<td width="120%">Wukong</td><td>53.4</td><td>80.2</td><td>90.1</td><td>74.0</td><td>94.4</td><td>98.1</td><td>55.2</td><td>81.0</td><td>90.6</td><td>73.3</td><td>94.0</td><td>98.0</td>
</tr>
<tr align="center">
<td width="120%">R2D2</td><td>56.4</td><td>85.0</td><td>93.1</td><td>79.1</td><td>96.5</td><td>98.9</td><td>63.3</td><td>89.3</td><td>95.7</td><td>79.3</td><td>97.1</td><td>98.7</td>
</tr>
<tr align="center">
<td width="120%">CN-CLIP</td><td>69.2</td><td>89.9</td><td>96.1</td><td>81.5</td><td>96.9</td><td>99.1</td><td>63.0</td><td>86.6</td><td>92.9</td><td>83.5</td><td>97.3</td><td>99.2</td>
</tr>
</table>
<br>

**Zero-shot Image Classification**:
<table border="1" width="100%">
<tr align="center">
<th>Task</th><th>CIFAR10</th><th>CIFAR100</th><th>DTD</th><th>EuroSAT</th><th>FER</th><th>FGVC</th><th>KITTI</th><th>MNIST</th><th>PC</th><th>VOC</th>
</tr>
<tr align="center">
<td width="150%">GIT</td><td>88.5</td><td>61.1</td><td>42.9</td><td>43.4</td><td>41.4</td><td>6.7</td><td>22.1</td><td>68.9</td><td>50.0</td><td>80.2</td>
</tr>
<tr align="center">
<td width="150%">ALIGN</td><td>94.9</td><td>76.8</td><td>66.1</td><td>52.1</td><td>50.8</td><td>25.0</td><td>41.2</td><td>74.0</td><td>55.2</td><td>83.0</td>
</tr>
<tr align="center">
<td width="150%">CLIP</td><td>94.9</td><td>77.0</td><td>56.0</td><td>63.0</td><td>48.3</td><td>33.3</td><td>11.5</td><td>79.0</td><td>62.3</td><td>84.0</td>
</tr>
<tr align="center">
<td width="150%">Wukong</td><td>95.4</td><td>77.1</td><td>40.9</td><td>50.3</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td>
</tr>
<tr align="center">
<td width="150%">CN-CLIP</td><td>96.0</td><td>79.7</td><td>51.2</td><td>52.0</td><td>55.1</td><td>26.2</td><td>49.9</td><td>79.4</td><td>63.5</td><td>84.9</td>
</tr>
</table>
<br>

## Citation
If you find Chinese CLIP helpful, feel free to cite our paper. Thanks for your support!

```
@article{chinese-clip,
  title={Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese},
  author={Yang, An and Pan, Junshu and Lin, Junyang and Men, Rui and Zhang, Yichang and Zhou, Jingren and Zhou, Chang},
  journal={arXiv preprint arXiv:2211.01335},
  year={2022}
}
```
<br>
chinese-clip-vit-large-patch14-336px/clip_cn_vit-l-14-336.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:53b9a3141f14b2e87366b0857ce78ac1c4c99e028d4d3d832ca8107deae0775b
size 1626446358
chinese-clip-vit-large-patch14-336px/config.json
ADDED
@@ -0,0 +1,125 @@
{
  "architectures": [
    "ChineseCLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "chinese_clip",
  "projection_dim": 768,
  "text_config": {
    "architectures": [
      "ChineseCLIPTextModel"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
    "directionality": "bidi",
    "eos_token_id": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "chinese_clip_text_model",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "output_past": true,
    "pad_token_id": 0,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "type_vocab_size": 2,
    "vocab_size": 21128
  },
  "text_config_dict": null,
  "torch_dtype": "float32",
  "transformers_version": null,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 336,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 16,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 3,
    "num_hidden_layers": 24,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.21.3",
    "typical_p": 1.0,
    "use_bfloat16": false
  },
  "vision_config_dict": {
    "hidden_size": 1024,
    "image_size": 336,
    "intermediate_size": 4096,
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768
  }
}
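
For reference, the key fields in this config (768-dim projection space, 336-px ViT-L/14 vision tower, 12-layer text tower) can be inspected through the `transformers` config API. A minimal sketch, assuming the hub repo id used in the README above; a local path to this `config.json` works the same way:

```python
from transformers import ChineseCLIPConfig

config = ChineseCLIPConfig.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")

print(config.projection_dim)                 # 768  -> shared image/text embedding size
print(config.vision_config.image_size)       # 336  -> ViT-L/14@336px input resolution
print(config.vision_config.patch_size)       # 14
print(config.text_config.num_hidden_layers)  # 12   -> RoBERTa-wwm-base text encoder
```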
chinese-clip-vit-large-patch14-336px/preprocessor_config.json
ADDED
@@ -0,0 +1,21 @@
{
  "do_center_crop": false,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ChineseCLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "size": {
    "height": 336,
    "width": 336
  }
}
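
In effect, these settings resize inputs to 336x336 with bicubic resampling (`resample: 3`), skip center cropping, and normalize with the CLIP mean/std. A minimal sketch of what that looks like at preprocessing time; the repo id comes from the README above, and the random dummy image is purely illustrative:

```python
import numpy as np
from PIL import Image
from transformers import ChineseCLIPProcessor

processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-large-patch14-336px")

# Any RGB image works; a random dummy image stands in here.
image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))

# Resized to 336x336, normalized with the CLIP mean/std, no center crop.
pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 336, 336])
```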
chinese-clip-vit-large-patch14-336px/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:973be725d497f7a01d3caebf9e79098d568889f42dfcc64832bbbbc5c963a70f
size 1626425823
chinese-clip-vit-large-patch14-336px/vocab.txt
ADDED
The diff for this file is too large to render.