Félix Marty committed on
Commit
e5efaae
1 Parent(s): 6cc6e24
README.md CHANGED
@@ -1,3 +1,41 @@
1
  ---
2
  license: mit
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ datasets:
4
+ - sst2
5
+ - glue
6
+ language: en
7
  ---
8
+
9
+ This model is a fork of [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) quantized with [Optimum library 🤗](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#use-tensorrt-execution-provider-with-quantized-models) using static quantization.
10
+
11
+ This model can be used as follows:
12
+
13
+ ```python
14
+ import onnxruntime
15
+ from transformers import AutoTokenizer
16
+ from optimum.onnxruntime import ORTModelForSequenceClassification
17
+
18
+ session_options = onnxruntime.SessionOptions()
19
+ session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
20
+
21
+ tokenizer = AutoTokenizer.from_pretrained("fxmarty/distilbert-base-uncased-sst2-onnx-int8-for-tensorrt")
22
+ ort_model = ORTModelForSequenceClassification.from_pretrained(
23
+ "fxmarty/distilbert-base-uncased-sst2-onnx-int8-for-tensorrt",
24
+ provider="TensorrtExecutionProvider",
25
+ session_options=session_options,
26
+ provider_options={"trt_int8_enable": True},
27
+ )
28
+
29
+ inp = tokenizer("TensorRT is a bit painful to use, but at the end of day it runs smoothly and blazingly fast!", return_tensors="np")
30
+
31
+ res = ort_model(**inp)
32
+
33
+ print(res)
34
+ print(ort_model.config.id2label[res.logits[0].argmax()])
35
+ # SequenceClassifierOutput(loss=None, logits=array([[-0.545066 , 0.5609764]], dtype=float32), hidden_states=None, attentions=None)
36
+ # POSITIVE
37
+ ```
38
+
39
+ Inspecting the graph (for example [here with netron](https://netron.app)), we see that it contains Quantize and Dequantize nodes, that will be interpreted by TensorRT to run in INT8:
40
+
41
+ ![QDQ](qdq_nodes.png)
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/tmp/tmp1mjenugf",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "finetuning_task": "sst-2",
11
+ "hidden_dim": 3072,
12
+ "id2label": {
13
+ "0": "NEGATIVE",
14
+ "1": "POSITIVE"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "label2id": {
18
+ "NEGATIVE": 0,
19
+ "POSITIVE": 1
20
+ },
21
+ "max_position_embeddings": 512,
22
+ "model_type": "distilbert",
23
+ "n_heads": 12,
24
+ "n_layers": 6,
25
+ "output_past": true,
26
+ "pad_token_id": 0,
27
+ "qa_dropout": 0.1,
28
+ "seq_classif_dropout": 0.2,
29
+ "sinusoidal_pos_embds": false,
30
+ "tie_weights_": true,
31
+ "transformers_version": "4.27.0.dev0",
32
+ "vocab_size": 30522
33
+ }
eval_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "accuracy": 0.8876146788990825
3
+ }
model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477d0d0c102ed8529620664cc817b927869eddf286c9f78a85f28164300ed101
3
+ size 268183103
ort_config.json ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "one_external_file": true,
3
+ "opset": null,
4
+ "optimization": {},
5
+ "optimum_version": "1.6.5.dev0",
6
+ "quantization": {
7
+ "activations_dtype": "QInt8",
8
+ "activations_symmetric": true,
9
+ "format": "QDQ",
10
+ "is_static": true,
11
+ "mode": "QLinearOps",
12
+ "nodes_to_exclude": [
13
+ "/distilbert/transformer/layer.4/sa_layer_norm/ReduceMean_1",
14
+ "/distilbert/transformer/layer.1/ffn/activation/Div",
15
+ "/distilbert/transformer/layer.0/sa_layer_norm/Div",
16
+ "/distilbert/transformer/layer.2/Add_1",
17
+ "/distilbert/transformer/layer.1/ffn/activation/Erf",
18
+ "/distilbert/transformer/layer.5/sa_layer_norm/ReduceMean_1",
19
+ "/distilbert/transformer/layer.0/output_layer_norm/ReduceMean",
20
+ "/distilbert/transformer/layer.3/output_layer_norm/Sqrt",
21
+ "/distilbert/transformer/layer.0/ffn/activation/Add",
22
+ "/distilbert/transformer/layer.1/output_layer_norm/ReduceMean_1",
23
+ "/distilbert/transformer/layer.5/sa_layer_norm/Add_1",
24
+ "/distilbert/transformer/layer.2/ffn/activation/Erf",
25
+ "/distilbert/transformer/layer.4/ffn/activation/Mul_1",
26
+ "/distilbert/transformer/layer.1/sa_layer_norm/Add_1",
27
+ "/distilbert/transformer/layer.4/Add",
28
+ "/distilbert/transformer/layer.2/ffn/activation/Add",
29
+ "/distilbert/transformer/layer.2/ffn/activation/Mul_1",
30
+ "/distilbert/transformer/layer.1/output_layer_norm/Pow",
31
+ "/distilbert/transformer/layer.4/output_layer_norm/Mul",
32
+ "/distilbert/embeddings/LayerNorm/Div",
33
+ "/distilbert/transformer/layer.3/output_layer_norm/Pow",
34
+ "/distilbert/transformer/layer.0/Add",
35
+ "/distilbert/transformer/layer.1/output_layer_norm/Div",
36
+ "/distilbert/embeddings/LayerNorm/Sub",
37
+ "/distilbert/transformer/layer.1/sa_layer_norm/Add",
38
+ "/distilbert/transformer/layer.5/ffn/activation/Mul_1",
39
+ "/distilbert/transformer/layer.5/sa_layer_norm/Add",
40
+ "/distilbert/transformer/layer.2/sa_layer_norm/Mul",
41
+ "/distilbert/embeddings/LayerNorm/Add_1",
42
+ "/distilbert/transformer/layer.3/ffn/activation/Div",
43
+ "/distilbert/transformer/layer.1/Add_1",
44
+ "/distilbert/transformer/layer.4/ffn/activation/Mul",
45
+ "/distilbert/transformer/layer.5/sa_layer_norm/Div",
46
+ "/distilbert/transformer/layer.0/sa_layer_norm/Sub",
47
+ "/distilbert/transformer/layer.0/sa_layer_norm/Pow",
48
+ "/distilbert/transformer/layer.2/sa_layer_norm/ReduceMean",
49
+ "/distilbert/transformer/layer.3/output_layer_norm/Add_1",
50
+ "/distilbert/transformer/layer.1/output_layer_norm/Mul",
51
+ "/distilbert/transformer/layer.3/sa_layer_norm/Sub",
52
+ "/distilbert/transformer/layer.3/sa_layer_norm/Div",
53
+ "/distilbert/embeddings/LayerNorm/Pow",
54
+ "/distilbert/transformer/layer.5/sa_layer_norm/Sqrt",
55
+ "/distilbert/transformer/layer.1/output_layer_norm/ReduceMean",
56
+ "/distilbert/transformer/layer.4/output_layer_norm/Add",
57
+ "/distilbert/transformer/layer.2/output_layer_norm/Div",
58
+ "/distilbert/transformer/layer.5/ffn/activation/Mul",
59
+ "/distilbert/transformer/layer.0/ffn/activation/Mul",
60
+ "/distilbert/transformer/layer.3/ffn/activation/Mul",
61
+ "/distilbert/transformer/layer.3/output_layer_norm/ReduceMean",
62
+ "/distilbert/transformer/layer.3/sa_layer_norm/Mul",
63
+ "/distilbert/transformer/layer.2/ffn/activation/Mul",
64
+ "/distilbert/transformer/layer.5/Add",
65
+ "/distilbert/transformer/layer.0/output_layer_norm/Div",
66
+ "/distilbert/transformer/layer.4/ffn/activation/Div",
67
+ "/distilbert/transformer/layer.5/sa_layer_norm/Sub",
68
+ "/distilbert/transformer/layer.5/ffn/activation/Erf",
69
+ "/distilbert/embeddings/LayerNorm/Add",
70
+ "/distilbert/transformer/layer.4/Add_1",
71
+ "/distilbert/transformer/layer.1/sa_layer_norm/Sub",
72
+ "/distilbert/transformer/layer.4/output_layer_norm/Pow",
73
+ "/distilbert/transformer/layer.3/Add_1",
74
+ "/distilbert/transformer/layer.5/output_layer_norm/Add",
75
+ "/distilbert/transformer/layer.0/output_layer_norm/Mul",
76
+ "/distilbert/transformer/layer.4/output_layer_norm/Sqrt",
77
+ "/distilbert/transformer/layer.1/ffn/activation/Mul_1",
78
+ "/distilbert/transformer/layer.4/sa_layer_norm/Sqrt",
79
+ "/distilbert/embeddings/LayerNorm/ReduceMean",
80
+ "/distilbert/transformer/layer.4/output_layer_norm/ReduceMean_1",
81
+ "/distilbert/embeddings/LayerNorm/Sqrt",
82
+ "/distilbert/transformer/layer.2/sa_layer_norm/Sub",
83
+ "/distilbert/transformer/layer.5/sa_layer_norm/Pow",
84
+ "/distilbert/transformer/layer.4/ffn/activation/Erf",
85
+ "/distilbert/transformer/layer.0/output_layer_norm/Sqrt",
86
+ "/distilbert/transformer/layer.1/sa_layer_norm/Sqrt",
87
+ "/distilbert/transformer/layer.3/Add",
88
+ "/distilbert/transformer/layer.0/output_layer_norm/Sub",
89
+ "/distilbert/transformer/layer.0/output_layer_norm/Pow",
90
+ "/distilbert/transformer/layer.2/Add",
91
+ "/distilbert/transformer/layer.1/sa_layer_norm/Div",
92
+ "/distilbert/transformer/layer.3/sa_layer_norm/Sqrt",
93
+ "/distilbert/transformer/layer.0/ffn/activation/Erf",
94
+ "/distilbert/transformer/layer.1/sa_layer_norm/ReduceMean",
95
+ "/distilbert/transformer/layer.2/ffn/activation/Div",
96
+ "/distilbert/transformer/layer.3/sa_layer_norm/Add",
97
+ "/distilbert/transformer/layer.4/sa_layer_norm/Add",
98
+ "/distilbert/transformer/layer.4/sa_layer_norm/Add_1",
99
+ "/distilbert/transformer/layer.2/output_layer_norm/ReduceMean_1",
100
+ "/distilbert/transformer/layer.2/sa_layer_norm/ReduceMean_1",
101
+ "/distilbert/transformer/layer.5/output_layer_norm/Sub",
102
+ "/distilbert/transformer/layer.4/output_layer_norm/Sub",
103
+ "/distilbert/transformer/layer.3/output_layer_norm/Sub",
104
+ "/distilbert/transformer/layer.0/output_layer_norm/Add",
105
+ "/distilbert/transformer/layer.2/output_layer_norm/Add",
106
+ "/distilbert/transformer/layer.3/output_layer_norm/Mul",
107
+ "/distilbert/transformer/layer.5/output_layer_norm/Div",
108
+ "/distilbert/transformer/layer.2/sa_layer_norm/Add",
109
+ "/distilbert/transformer/layer.2/sa_layer_norm/Pow",
110
+ "/distilbert/transformer/layer.3/sa_layer_norm/ReduceMean",
111
+ "/distilbert/transformer/layer.1/output_layer_norm/Sqrt",
112
+ "/distilbert/transformer/layer.3/output_layer_norm/ReduceMean_1",
113
+ "/distilbert/transformer/layer.0/ffn/activation/Div",
114
+ "/distilbert/transformer/layer.3/ffn/activation/Add",
115
+ "/distilbert/embeddings/LayerNorm/ReduceMean_1",
116
+ "/distilbert/transformer/layer.3/sa_layer_norm/Add_1",
117
+ "/distilbert/transformer/layer.1/sa_layer_norm/Pow",
118
+ "/distilbert/transformer/layer.0/sa_layer_norm/ReduceMean",
119
+ "/distilbert/transformer/layer.0/sa_layer_norm/Add_1",
120
+ "/distilbert/transformer/layer.2/output_layer_norm/Sqrt",
121
+ "/distilbert/transformer/layer.4/ffn/activation/Add",
122
+ "/distilbert/transformer/layer.5/output_layer_norm/ReduceMean",
123
+ "/distilbert/transformer/layer.3/sa_layer_norm/Pow",
124
+ "/distilbert/transformer/layer.0/output_layer_norm/ReduceMean_1",
125
+ "/distilbert/transformer/layer.5/ffn/activation/Div",
126
+ "/distilbert/transformer/layer.5/output_layer_norm/Pow",
127
+ "/distilbert/transformer/layer.3/ffn/activation/Mul_1",
128
+ "/distilbert/transformer/layer.2/output_layer_norm/Mul",
129
+ "/distilbert/transformer/layer.5/ffn/activation/Add",
130
+ "/distilbert/transformer/layer.2/output_layer_norm/ReduceMean",
131
+ "/distilbert/transformer/layer.2/output_layer_norm/Pow",
132
+ "/distilbert/transformer/layer.4/sa_layer_norm/Pow",
133
+ "/distilbert/transformer/layer.5/output_layer_norm/Mul",
134
+ "/distilbert/transformer/layer.1/Add",
135
+ "/distilbert/transformer/layer.0/sa_layer_norm/Sqrt",
136
+ "/distilbert/transformer/layer.3/output_layer_norm/Div",
137
+ "/distilbert/transformer/layer.1/ffn/activation/Add",
138
+ "/distilbert/transformer/layer.1/sa_layer_norm/Mul",
139
+ "/distilbert/transformer/layer.1/ffn/activation/Mul",
140
+ "/distilbert/transformer/layer.0/sa_layer_norm/ReduceMean_1",
141
+ "/distilbert/transformer/layer.4/sa_layer_norm/Div",
142
+ "/distilbert/transformer/layer.4/output_layer_norm/ReduceMean",
143
+ "/distilbert/transformer/layer.2/sa_layer_norm/Div",
144
+ "/distilbert/transformer/layer.0/Add_1",
145
+ "/distilbert/transformer/layer.5/sa_layer_norm/ReduceMean",
146
+ "/distilbert/transformer/layer.0/sa_layer_norm/Mul",
147
+ "/distilbert/transformer/layer.2/output_layer_norm/Sub",
148
+ "/distilbert/transformer/layer.1/sa_layer_norm/ReduceMean_1",
149
+ "/distilbert/embeddings/LayerNorm/Mul",
150
+ "/distilbert/transformer/layer.3/output_layer_norm/Add",
151
+ "/distilbert/transformer/layer.0/sa_layer_norm/Add",
152
+ "/distilbert/transformer/layer.1/output_layer_norm/Add_1",
153
+ "/distilbert/transformer/layer.5/Add_1",
154
+ "/distilbert/transformer/layer.0/output_layer_norm/Add_1",
155
+ "/distilbert/transformer/layer.5/output_layer_norm/ReduceMean_1",
156
+ "/distilbert/transformer/layer.2/sa_layer_norm/Add_1",
157
+ "/distilbert/transformer/layer.0/ffn/activation/Mul_1",
158
+ "/distilbert/transformer/layer.4/sa_layer_norm/ReduceMean",
159
+ "/distilbert/transformer/layer.3/sa_layer_norm/ReduceMean_1",
160
+ "/distilbert/transformer/layer.5/sa_layer_norm/Mul",
161
+ "/distilbert/transformer/layer.4/sa_layer_norm/Sub",
162
+ "/distilbert/transformer/layer.4/output_layer_norm/Add_1",
163
+ "/distilbert/transformer/layer.5/output_layer_norm/Add_1",
164
+ "/distilbert/transformer/layer.1/output_layer_norm/Add",
165
+ "/distilbert/transformer/layer.4/sa_layer_norm/Mul",
166
+ "/distilbert/transformer/layer.5/output_layer_norm/Sqrt",
167
+ "/distilbert/embeddings/Add",
168
+ "/distilbert/transformer/layer.4/output_layer_norm/Div",
169
+ "/distilbert/transformer/layer.2/sa_layer_norm/Sqrt",
170
+ "/distilbert/transformer/layer.1/output_layer_norm/Sub",
171
+ "/distilbert/transformer/layer.3/ffn/activation/Erf",
172
+ "/distilbert/transformer/layer.2/output_layer_norm/Add_1"
173
+ ],
174
+ "nodes_to_quantize": [],
175
+ "operators_to_quantize": [
176
+ "MatMul",
177
+ "Add"
178
+ ],
179
+ "per_channel": false,
180
+ "qdq_add_pair_to_weight": true,
181
+ "qdq_dedicated_pair": true,
182
+ "qdq_op_type_per_channel_support_to_axis": {
183
+ "MatMul": 1
184
+ },
185
+ "reduce_range": false,
186
+ "weights_dtype": "QInt8",
187
+ "weights_symmetric": true
188
+ },
189
+ "transformers_version": "4.27.0.dev0",
190
+ "use_external_data_format": false
191
+ }
qdq_nodes.png ADDED
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_basic_tokenize": true,
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "never_split": null,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "special_tokens_map_file": null,
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "DistilBertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff