feat: ONNX model
Signed-off-by: Thytu <valentin.de-matos@epitech.eu>
- sentence-transformers_all-MiniLM-L6-v2_onnx_inference/config.pbtxt +66 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_model/1/model.bin +3 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_model/config.pbtxt +35 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/config.json +25 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/model.py +70 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/special_tokens_map.json +7 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/tokenizer.json +0 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/tokenizer_config.json +16 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/vocab.txt +0 -0
- sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/config.pbtxt +36 -0
sentence-transformers_all-MiniLM-L6-v2_onnx_inference/config.pbtxt
ADDED
@@ -0,0 +1,66 @@
+name: "sentence-transformers_all-MiniLM-L6-v2_onnx_inference"
+max_batch_size: 0
+platform: "ensemble"
+
+input [
+    {
+        name: "TEXT"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+    }
+]
+
+output {
+    name: "output"
+    data_type: TYPE_FP32
+    dims: [-1, 384]
+}
+
+ensemble_scheduling {
+    step [
+        {
+            model_name: "sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize"
+            model_version: -1
+            input_map {
+                key: "TEXT"
+                value: "TEXT"
+            }
+            output_map [
+                {
+                    key: "input_ids"
+                    value: "input_ids"
+                },
+                {
+                    key: "token_type_ids"
+                    value: "token_type_ids"
+                },
+                {
+                    key: "attention_mask"
+                    value: "attention_mask"
+                }
+            ]
+        },
+        {
+            model_name: "sentence-transformers_all-MiniLM-L6-v2_onnx_model"
+            model_version: -1
+            input_map [
+                {
+                    key: "input_ids"
+                    value: "input_ids"
+                },
+                {
+                    key: "token_type_ids"
+                    value: "token_type_ids"
+                },
+                {
+                    key: "attention_mask"
+                    value: "attention_mask"
+                }
+            ]
+            output_map {
+                key: "output"
+                value: "output"
+            }
+        }
+    ]
+}
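
For reference, a minimal client sketch for this ensemble (not part of the commit): it assumes a Triton server is already running with this model repository loaded, that the tritonclient package is installed, and that localhost:8000 is the HTTP endpoint; adjust names and URL as needed.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# TEXT is a TYPE_STRING input, sent as a BYTES tensor of one sentence
text = np.array([b"What is Deep Learning?"], dtype=object)
text_input = httpclient.InferInput("TEXT", [1], "BYTES")
text_input.set_data_from_numpy(text)

result = client.infer(
    model_name="sentence-transformers_all-MiniLM-L6-v2_onnx_inference",
    inputs=[text_input],
    outputs=[httpclient.InferRequestedOutput("output")],
)
embedding = result.as_numpy("output")
print(embedding.shape)  # per the config above, the last dimension is 384

The ensemble itself does no computation: it only routes TEXT into the tokenize step and the resulting int32 tensors into the ONNX model, as declared in ensemble_scheduling above.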
sentence-transformers_all-MiniLM-L6-v2_onnx_model/1/model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70c610821ee50fff50dce1bb7e7592f2d31b24b50833e1145908c3b2a3aa57d5
+size 46475704
sentence-transformers_all-MiniLM-L6-v2_onnx_model/config.pbtxt
ADDED
@@ -0,0 +1,35 @@
+name: "sentence-transformers_all-MiniLM-L6-v2_onnx_model"
+max_batch_size: 0
+platform: "onnxruntime_onnx"
+default_model_filename: "model.bin"
+
+input [
+    {
+        name: "input_ids"
+        data_type: TYPE_INT32
+        dims: [-1, -1]
+    },
+    {
+        name: "token_type_ids"
+        data_type: TYPE_INT32
+        dims: [-1, -1]
+    },
+    {
+        name: "attention_mask"
+        data_type: TYPE_INT32
+        dims: [-1, -1]
+    }
+]
+
+output {
+    name: "output"
+    data_type: TYPE_FP32
+    dims: [-1, 384]
+}
+
+instance_group [
+    {
+        count: 1
+        kind: KIND_GPU
+    }
+]
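
A quick local sanity check of the exported file against this config could look like the sketch below (assuming onnxruntime is installed and the relative path to model.bin is run from the repository root; both are illustrative). It prints the model's input/output names and shapes, which should line up with the three TYPE_INT32 inputs and the [-1, 384] FP32 output declared above.

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "sentence-transformers_all-MiniLM-L6-v2_onnx_model/1/model.bin",
    providers=["CPUExecutionProvider"],
)
print([(i.name, i.type, i.shape) for i in sess.get_inputs()])
print([(o.name, o.type, o.shape) for o in sess.get_outputs()])

# feed a dummy 8-token sequence, int32 as in the config
seq = np.ones((1, 8), dtype=np.int32)
out = sess.run(["output"], {
    "input_ids": seq,
    "token_type_ids": np.zeros_like(seq),
    "attention_mask": seq,
})[0]
print(out.shape, out.dtype)  # expected: (..., 384), float32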
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.24.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/model.py
ADDED
@@ -0,0 +1,70 @@
+# Copyright 2022, Lefebvre Dalloz Services
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This module is copy-pasted in generated Triton configuration folder to perform the tokenization step.
+"""
+
+# noinspection DuplicatedCode
+import os
+from typing import Dict, List
+
+import numpy as np
+
+
+try:
+    # noinspection PyUnresolvedReferences
+    import triton_python_backend_utils as pb_utils
+except ImportError:
+    pass  # triton_python_backend_utils exists only inside Triton Python backend.
+
+from transformers import AutoTokenizer, PreTrainedTokenizer, TensorType
+
+
+class TritonPythonModel:
+    tokenizer: PreTrainedTokenizer
+
+    def initialize(self, args: Dict[str, str]) -> None:
+        """
+        Initialize the tokenization process
+        :param args: arguments from Triton config file
+        """
+        # more variables in https://github.com/triton-inference-server/python_backend/blob/main/src/python.cc
+        path: str = os.path.join(args["model_repository"], args["model_version"])
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+
+    def execute(self, requests) -> "List[List[pb_utils.Tensor]]":
+        """
+        Parse and tokenize each request
+        :param requests: 1 or more requests received by Triton server.
+        :return: text as input tensors
+        """
+        responses = []
+        # for loop for batch requests (disabled in our case)
+        for request in requests:
+            # binary data typed back to string
+            query = [t.decode("UTF-8") for t in pb_utils.get_input_tensor_by_name(request, "TEXT").as_numpy().tolist()]
+            tokens: Dict[str, np.ndarray] = self.tokenizer(text=query, return_tensors=TensorType.NUMPY)
+            # tensorrt uses int32 as input type, ort uses int64
+            tokens = {k: v.astype(np.int32) for k, v in tokens.items()}
+            # communicate the tokenization results to Triton server
+            outputs = list()
+            for input_name in self.tokenizer.model_input_names:
+                tensor_input = pb_utils.Tensor(input_name, tokens[input_name])
+                outputs.append(tensor_input)
+
+            inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
+            responses.append(inference_response)
+
+        return responses
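
Outside Triton, the tokenization step above can be sanity-checked with a short sketch like this (assuming transformers and numpy are installed; the example sentence is arbitrary). It reproduces the int32 cast and shows that the tokenizer's model_input_names match the three tensors the downstream ONNX model expects.

import numpy as np
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
enc = tok(text=["What is Deep Learning?"], return_tensors="np")
# same cast as in execute(): ONNX Runtime config here declares TYPE_INT32 inputs
enc = {k: v.astype(np.int32) for k, v in enc.items()}
print(tok.model_input_names)  # ['input_ids', 'token_type_ids', 'attention_mask']
print({k: (v.dtype, v.shape) for k, v in enc.items()})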
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": "/root/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/7dbbc90392e2f80f3d3c277d6e90027e55de9125/special_tokens_map.json",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/1/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize/config.pbtxt
ADDED
@@ -0,0 +1,36 @@
+name: "sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize"
+max_batch_size: 0
+backend: "python"
+
+input [
+    {
+        name: "TEXT"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+    }
+]
+
+output [
+    {
+        name: "input_ids"
+        data_type: TYPE_INT32
+        dims: [-1, -1]
+    },
+    {
+        name: "token_type_ids"
+        data_type: TYPE_INT32
+        dims: [-1, -1]
+    },
+    {
+        name: "attention_mask"
+        data_type: TYPE_INT32
+        dims: [-1, -1]
+    }
+]
+
+instance_group [
+    {
+        count: 1
+        kind: KIND_GPU
+    }
+]
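
The tokenize model can also be queried on its own, which helps when debugging the intermediate tensors of the ensemble. A hedged sketch, under the same assumptions as the ensemble client above (a running Triton server on localhost:8000 and the tritonclient package installed):

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

text = np.array([b"What is Deep Learning?"], dtype=object)
text_input = httpclient.InferInput("TEXT", [1], "BYTES")
text_input.set_data_from_numpy(text)

result = client.infer(
    model_name="sentence-transformers_all-MiniLM-L6-v2_onnx_tokenize",
    inputs=[text_input],
    outputs=[httpclient.InferRequestedOutput(name)
             for name in ("input_ids", "token_type_ids", "attention_mask")],
)
# dtype should be int32, matching the TYPE_INT32 outputs declared above
print(result.as_numpy("input_ids").dtype, result.as_numpy("input_ids").shape)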