mgoin committed 2e58142 (1 parent: 942b654)
README.md ADDED
---
pipeline_tag: zero-shot-classification
base_model: laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K
inference: false
tags:
- deepsparse
---
This is an unoptimized, exported version of [laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K](https://huggingface.co/laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K) that is ready to use with [DeepSparse](https://github.com/neuralmagic/deepsparse).

- Notebook for basic usage: [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ZvU9ZSHJKSeJyH5bgxo_A-GSVIUcSt2E?usp=sharing)
- Notebook for Imagenette evaluation: [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-Duq0YNtjzOnmuXCYo-5DDiOzeCItXpN?usp=sharing)

## Setup for usage
First, install DeepSparse with extensions for CLIP (quoted so the shell does not interpret the brackets or `>=`):
```
pip install "deepsparse-nightly[clip]>=1.7.0.20231210"
```
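To confirm that the CLIP extras installed correctly, a quick import check can help. This is a minimal sketch; it only verifies that the packages import and prints the installed DeepSparse version:
```python
import deepsparse
import deepsparse.clip  # should fail if the [clip] optional dependencies are missing

print(deepsparse.__version__)
```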

Download some test images of a church, a dog, and elephants:
```
wget -O basilica.jpg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/src/deepsparse/yolo/sample_images/basilica.jpg
wget -O buddy.jpeg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/tests/deepsparse/pipelines/sample_images/buddy.jpeg
wget -O thailand.jpg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/src/deepsparse/yolact/sample_images/thailand.jpg
```

The exported text model takes a second input holding the token sequence lengths, so run this input override before creating any text pipeline:
```python
import numpy as np
from deepsparse.clip import CLIPTextPipeline

def custom_process_inputs(self, inputs):
    # Normalize the input into a list of strings
    if not isinstance(inputs.text, list):
        inputs.text = [inputs.text]
    if not isinstance(inputs.text[0], str):
        return inputs.text
    # Tokenize and stack into a (batch, context_length) int32 array
    tokens = [np.array(t).astype(np.int32) for t in self.tokenizer(inputs.text)]
    tokens = np.stack(tokens, axis=0)
    # Build the second input: one sequence-length entry per prompt
    tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])
    return [tokens, tokens_lengths]

# This overrides process_inputs globally for all CLIPTextPipeline instances
CLIPTextPipeline.process_inputs = custom_process_inputs
```
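For intuition about the shapes the override produces, here is a standalone sketch with dummy token IDs (an illustration only; real IDs come from the pipeline's tokenizer, and 77 is the CLIP context length from `tokenizer_config.json`):
```python
import numpy as np

# Two dummy "tokenized" prompts, each padded to the 77-token context length
tokens = np.zeros((2, 77), dtype=np.int32)
# One length entry per prompt, mirroring the override above
tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])

print(tokens.shape)    # (2, 77)
print(tokens_lengths)  # [76 76]
```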

## Text embedding pipeline

Here is an example of how to create and use a [DeepSparse pipeline for text embeddings](https://github.com/neuralmagic/deepsparse/blob/main/src/deepsparse/clip/text_pipeline.py).
```python
from deepsparse import Pipeline
from huggingface_hub import snapshot_download

# Download the model from HF
model_folder = snapshot_download(repo_id="neuralmagic/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds")

text_embed_pipeline = Pipeline.create(task="clip_text", model_path=model_folder + "/textual.onnx")

text = ["ice cream", "an elephant", "a dog", "a building", "a church"]

embeddings = text_embed_pipeline(text=text).text_embeddings
for i in range(len(embeddings)):
    print(embeddings[i].shape)
    print(embeddings[i])
```
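The returned embeddings are plain NumPy arrays, so they can be compared directly. A minimal sketch, assuming the `embeddings` list from the block above, that checks related prompts score closer than unrelated ones under cosine similarity:
```python
import numpy as np

def cosine_sim(a, b):
    # Flatten to 1-D vectors and compute cosine similarity
    a, b = np.asarray(a).reshape(-1), np.asarray(b).reshape(-1)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# "a building" vs "a church" should score higher than "ice cream" vs "a church"
print(cosine_sim(embeddings[3], embeddings[4]))
print(cosine_sim(embeddings[0], embeddings[4]))
```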

## Image embedding pipeline

Here is an example of how to create and use a [DeepSparse pipeline for image embeddings](https://github.com/neuralmagic/deepsparse/blob/main/src/deepsparse/clip/visual_pipeline.py).
```python
from deepsparse import Pipeline
from huggingface_hub import snapshot_download

# Download the model from HF
model_folder = snapshot_download(repo_id="neuralmagic/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds")

image_embed_pipeline = Pipeline.create(task="clip_visual", model_path=model_folder + "/visual.onnx")

images = ["basilica.jpg", "buddy.jpeg", "thailand.jpg"]

embeddings = image_embed_pipeline(images=images).image_embeddings
for i in range(len(embeddings)):
    print(embeddings[i].shape)
    print(embeddings[i])
```

## Zero-shot image classification pipeline

Since CLIP trains the text and image embedding models in tandem, we can generate embeddings for both modalities and relate them to each other without any retraining. Here is an example of how to create and use a [DeepSparse pipeline for zero-shot image classification](https://github.com/neuralmagic/deepsparse/blob/main/src/deepsparse/clip/zeroshot_pipeline.py).
```python
import numpy as np

from deepsparse import Pipeline
from deepsparse.clip import (
    CLIPTextInput,
    CLIPVisualInput,
    CLIPZeroShotInput
)
from huggingface_hub import snapshot_download

# Download the model from HF
model_folder = snapshot_download(repo_id="neuralmagic/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds")

possible_classes = ["ice cream", "an elephant", "a dog", "a building", "a church"]
images = ["basilica.jpg", "buddy.jpeg", "thailand.jpg"]

# Load the model into DeepSparse
pipeline = Pipeline.create(
    task="clip_zeroshot",
    visual_model_path=model_folder + "/visual.onnx",
    text_model_path=model_folder + "/textual.onnx"
)

# Infer
output = pipeline(
    image=CLIPVisualInput(images=images),
    text=CLIPTextInput(text=possible_classes),
).text_scores

for i in range(len(output)):
    prediction = possible_classes[np.argmax(output[i])]
    print(f"Image {images[i]} is a picture of {prediction}")

"""
Image basilica.jpg is a picture of a church
Image buddy.jpeg is a picture of a dog
Image thailand.jpg is a picture of an elephant
"""
```
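The zero-shot scores can also be reproduced by hand from the text and image embedding pipelines above, which makes the relationship between the two embedding spaces explicit. A minimal sketch, assuming `text_embed_pipeline`, `image_embed_pipeline`, `possible_classes`, and `images` from the earlier sections, and CLIP's usual logit scale of 100 (an assumption, not read from the exported model):
```python
import numpy as np

# Embed the class prompts and the images with the pipelines created earlier
text_emb = np.vstack([np.asarray(e).reshape(-1) for e in text_embed_pipeline(text=possible_classes).text_embeddings])
img_emb = np.vstack([np.asarray(e).reshape(-1) for e in image_embed_pipeline(images=images).image_embeddings])

# L2-normalize, then score every image against every class prompt
text_emb /= np.linalg.norm(text_emb, axis=1, keepdims=True)
img_emb /= np.linalg.norm(img_emb, axis=1, keepdims=True)
logits = 100.0 * img_emb @ text_emb.T  # assumed logit scale

for image, row in zip(images, logits):
    print(f"Image {image} is a picture of {possible_classes[np.argmax(row)]}")
```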
open_clip_config.json ADDED
{
  "model_cfg": {
    "embed_dim": 512,
    "vision_cfg": {
      "image_size": 256,
      "layers": 12,
      "width": 768,
      "patch_size": 32
    },
    "text_cfg": {
      "context_length": 77,
      "vocab_size": 49408,
      "width": 512,
      "heads": 8,
      "layers": 12
    }
  },
  "preprocess_cfg": {
    "mean": [
      0.48145466,
      0.4578275,
      0.40821073
    ],
    "std": [
      0.26862954,
      0.26130258,
      0.27577711
    ]
  }
}
special_tokens_map.json ADDED
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|endoftext|>",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
textual.onnx ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:61613f61c35db031c32d6c9dae562aec8fdbbd8b33552a437fc9ffc19ac593c9
size 254100057
tokenizer_config.json ADDED
{
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
visual.onnx ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:16a548893da9333bde0d17c8b516c1f1f53ea64a06ca0ecce5aeae08dd8648af
size 351499921