# Copyright 2022, Lefebvre Dalloz Services
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module is copy-pasted in generated Triton configuration folder to perform the tokenization step.
"""
# noinspection DuplicatedCode
from pathlib import Path
from typing import Dict, List
import numpy as np
try:
    # noinspection PyUnresolvedReferences
    import triton_python_backend_utils as pb_utils
except ImportError:
    pass  # triton_python_backend_utils exists only inside the Triton Python backend.
from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizer, TensorType


class TritonPythonModel:
    tokenizer: PreTrainedTokenizer

    def initialize(self, args: Dict[str, str]) -> None:
"""
Initialize the tokenization process
:param args: arguments from Triton config file
"""
# more variables in https://github.com/triton-inference-server/python_backend/blob/main/src/python.cc
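        # Assumed on-disk layout (the standard Triton model repository convention):
        #   <model_repository>/
        #       config.pbtxt
        #       1/                       <- model version folder
        #           model.py             <- this file
        #           tokenizer.json, special_tokens_map.json, ...  (HuggingFace tokenizer files)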
        # the tokenizer files are stored next to this script, inside the model version
        # folder (assumes the standard args["model_version"] entry, e.g. "1")
        path: str = str(Path(args["model_repository"]) / args["model_version"])
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def execute(self, requests) -> "List[List[pb_utils.Tensor]]":
        """
        Parse and tokenize each request.
        :param requests: 1 or more requests received by the Triton server
        :return: tokenized text as Triton output tensors
        """
        responses = []
        # loop over the batch of requests (dynamic batching is disabled in our case)
        for request in requests:
            # binary data decoded back to string
            query = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "TEXT").as_numpy().tolist()
            ]
            tokens: BatchEncoding = self.tokenizer(
                text=query, return_tensors=TensorType.NUMPY, padding=True, pad_to_multiple_of=8
            )
            # TensorRT uses int32 as input type, ONNX Runtime uses int64
            tokens_dict = {k: v.astype(np.int32) for k, v in tokens.items()}
            # communicate the tokenization results to the Triton server
            outputs = []
            for input_name in self.tokenizer.model_input_names:
                tensor_input = pb_utils.Tensor(input_name, tokens_dict[input_name])
                outputs.append(tensor_input)
            inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
            responses.append(inference_response)
        return responses
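

# --------------------------------------------------------------------------------
# Usage sketch (not part of the model): how a client could query this model once it
# is deployed. Assumptions: Triton serves HTTP on 127.0.0.1:8000 and the model is
# registered under the hypothetical name "tokenize" (the real name comes from the
# generated config.pbtxt); output names match self.tokenizer.model_input_names.
#
#   import numpy as np
#   import tritonclient.http as triton_http
#
#   client = triton_http.InferenceServerClient(url="127.0.0.1:8000")
#   text = np.array([b"Some sentence to tokenize"], dtype=object)
#   text_input = triton_http.InferInput(name="TEXT", shape=[1], datatype="BYTES")
#   text_input.set_data_from_numpy(text, binary_data=True)
#   result = client.infer(model_name="tokenize", inputs=[text_input])
#   input_ids = result.as_numpy("input_ids")  # int32 tensor produced by execute()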