import datetime
import json
import os
import time
from threading import Lock, Thread

import numpy as np
import triton_python_backend_utils as pb_utils
from torch import from_numpy

import tensorrt_llm.bindings.executor as trtllm


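# The helper functions below translate Triton request tensors and model-config
# parameters into tensorrt_llm.bindings.executor objects, and translate
# executor responses back into Triton output tensors.

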
def get_input_tensor_by_name(request, name):
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    if tensor is None:
        return None
    return tensor.as_numpy()


def get_input_scalar_by_name(request, name):
    tensor = get_input_tensor_by_name(request, name)
    if tensor is None:
        return None
    if tensor.size != 1:
        raise pb_utils.TritonModelException(
            f"Expected a single value for {name}")
    return tensor.item()


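# Illustrative example (hypothetical values): a "request_output_len" input of
# shape [1, 1] holding 128 yields the Python int 128; an absent optional input
# yields None, and a multi-element tensor raises a TritonModelException.

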
def read_parameter_as_type(value, name, pytype=str):
    if value == "":
        return None
    if value.startswith("${") and value.endswith("}"):
        return None
    if pytype is bool:
        return value.lower() in ["1", "true"]
    try:
        result = pytype(value)
        return result
    except Exception:
        pb_utils.Logger.log_warning(
            f"Could not read parameter '{name}' with value '{value}', will use default."
        )
        return None


def get_parameter(model_config, name, pytype=str):
    if name not in model_config['parameters']:
        return None
    return read_parameter_as_type(
        model_config['parameters'][name]['string_value'], name, pytype)


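# Illustrative config.pbtxt sketch (hypothetical values): an entry such as
#
#   parameters: { key: "max_beam_width" value: { string_value: "2" } }
#
# is read as get_parameter(model_config, "max_beam_width", int) == 2, while an
# empty value or an unexpanded template like "${max_beam_width}" returns None.

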
def convert_word_list(word_list):
    if word_list is None:
        return None
    word_list = word_list.tolist()
    if len(word_list) == 0 or len(word_list[0]) != 2:
        raise pb_utils.TritonModelException("Invalid format for word list.")
    words, indices = word_list[0]
    result = []
    current_index = 0
    for i in indices:
        if i == -1:
            continue
        if i > len(words):
            raise pb_utils.TritonModelException(
                "Invalid format for word list.")
        current_word = []
        while current_index < i:
            current_word.append(words[current_index])
            current_index += 1
        result.append(current_word)
    return result


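# Illustrative example (hypothetical token ids) for the flattened Triton
# word-list layout of shape [1, 2, N]:
#
#   convert_word_list(np.array([[[5, 6, 7], [2, 3, -1]]], dtype=np.int32))
#
# returns [[5, 6], [7]]: one word made of token ids 5 and 6, a second word made
# of token id 7; -1 entries in the offsets row are padding.

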
def parse_medusa_choices(medusa_choices):
    if medusa_choices is None:
        return None
    try:
        result = json.loads(
            "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
        assert isinstance(result, list) and len(result) > 0
        assert all([isinstance(x, list) for x in result])
        assert all([isinstance(y, int) for x in result for y in x])
    except Exception:
        raise pb_utils.TritonModelException(
            "Invalid format for medusa_choices")
    return result


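# Illustrative example (hypothetical value): the string "{0, 0, 0}, {0, 1}"
# parses to [[0, 0, 0], [0, 1]]; anything that is not a non-empty list of
# integer lists raises a TritonModelException.

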
def get_sampling_config_from_request(request):
    kwargs = {}
    kwargs['beam_width'] = get_input_scalar_by_name(request, 'beam_width') or 1
    kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k')
    top_p = get_input_scalar_by_name(request, 'runtime_top_p')
    kwargs['top_p'] = None if top_p is None or top_p <= 0 else top_p
    kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed')
    kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature')
    kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length')
    kwargs['repetition_penalty'] = get_input_scalar_by_name(
        request, 'repetition_penalty')
    kwargs['presence_penalty'] = get_input_scalar_by_name(
        request, 'presence_penalty')
    kwargs['frequency_penalty'] = get_input_scalar_by_name(
        request, 'frequency_penalty')
    kwargs['length_penalty'] = get_input_scalar_by_name(request, 'len_penalty')
    kwargs['top_p_min'] = get_input_scalar_by_name(request,
                                                   'runtime_top_p_min')
    kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
        request, 'runtime_top_p_reset_ids')
    kwargs['top_p_decay'] = get_input_scalar_by_name(request,
                                                     'runtime_top_p_decay')
    kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
        request, 'beam_search_diversity_rate')
    kwargs['early_stopping'] = get_input_scalar_by_name(
        request, 'early_stopping')
    # Drop unset values so the executor falls back to its own defaults.
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.SamplingConfig(**kwargs)


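# Illustrative example (hypothetical request): a client that only supplies
# runtime_top_k=1 and temperature=0.7 produces
# trtllm.SamplingConfig(beam_width=1, top_k=1, temperature=0.7), leaving every
# other sampling field at the executor's default.

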
def get_output_config_from_request(request, exclude_input_from_output):
    kwargs = {}
    kwargs["return_log_probs"] = get_input_scalar_by_name(
        request, 'return_log_probs')
    kwargs["return_context_logits"] = get_input_scalar_by_name(
        request, 'return_context_logits')
    kwargs["return_generation_logits"] = get_input_scalar_by_name(
        request, 'return_generation_logits')
    kwargs["exclude_input_from_output"] = exclude_input_from_output
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    return trtllm.OutputConfig(**kwargs)


def get_external_draft_tokens_config_from_request(request):
    kwargs = {}
    draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids')
    if draft_input_ids is not None:
        kwargs['tokens'] = draft_input_ids.tolist()
    draft_logits = get_input_tensor_by_name(request, 'draft_logits')
    if draft_logits is not None:
        kwargs['logits'] = from_numpy(draft_logits)
    kwargs['acceptance_threshold'] = get_input_scalar_by_name(
        request, 'draft_acceptance_threshold')
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    if len(kwargs) > 0:
        return trtllm.ExternalDraftTokensConfig(**kwargs)
    return None


def get_prompt_tuning_config_from_request(request):
    kwargs = {}
    prompt_embedding_table = get_input_tensor_by_name(
        request, 'prompt_embedding_table')
    if prompt_embedding_table is not None:
        kwargs["embedding_table"] = from_numpy(prompt_embedding_table)
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    if len(kwargs) > 0:
        return trtllm.PromptTuningConfig(**kwargs)
    return None


def get_lora_config_from_request(request):
    kwargs = {}
    kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id')
    lora_weights = get_input_tensor_by_name(request, 'lora_weights')
    if lora_weights is not None:
        kwargs["weights"] = from_numpy(lora_weights)
    lora_config = get_input_tensor_by_name(request, 'lora_config')
    if lora_config is not None:
        kwargs["config"] = from_numpy(lora_config)
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    if len(kwargs) > 0:
        return trtllm.LoraConfig(**kwargs)
    return None


def convert_request(request, exclude_input_from_output, decoupled):
    inputs = {}
    input_token_ids = get_input_tensor_by_name(request, 'input_ids')
    if input_token_ids is None:
        raise pb_utils.TritonModelException(
            "A value is required for input_ids")
    input_token_ids = input_token_ids.tolist()
    if len(input_token_ids) == 0:
        raise pb_utils.TritonModelException("Invalid format for input_ids")
    inputs['input_token_ids'] = input_token_ids[0]

    inputs['max_new_tokens'] = get_input_scalar_by_name(
        request, 'request_output_len')
    if inputs['max_new_tokens'] is None:
        raise pb_utils.TritonModelException(
            "A value is required for request_output_len")
    inputs['streaming'] = get_input_scalar_by_name(request, 'streaming')
    if inputs['streaming'] and not decoupled:
        raise pb_utils.TritonModelException(
            "Streaming is only supported in decoupled mode.")
    inputs['end_id'] = get_input_scalar_by_name(request, 'end_id')
    inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id')
    inputs['stop_words'] = convert_word_list(
        get_input_tensor_by_name(request, 'stop_words_list'))
    inputs['bad_words'] = convert_word_list(
        get_input_tensor_by_name(request, 'bad_words_list'))
    embedding_bias = get_input_tensor_by_name(request, 'embedding_bias')
    if embedding_bias is not None and embedding_bias.size != 0:
        inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()

    sampling_config = get_sampling_config_from_request(request)
    output_config = get_output_config_from_request(request,
                                                   exclude_input_from_output)
    external_draft_tokens_config = get_external_draft_tokens_config_from_request(
        request)
    prompt_tuning_config = get_prompt_tuning_config_from_request(request)
    lora_config = get_lora_config_from_request(request)

    return trtllm.Request(
        **inputs,
        sampling_config=sampling_config,
        output_config=output_config,
        external_draft_tokens_config=external_draft_tokens_config,
        prompt_tuning_config=prompt_tuning_config,
        lora_config=lora_config,
    )


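# Minimal request sketch (hypothetical shapes): a request only needs an
# "input_ids" tensor of shape [1, num_input_tokens] and a scalar
# "request_output_len"; every other tensor handled above is optional and is
# simply left out of the executor Request when absent.

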
def convert_response(response):
    if response.has_error():
        return pb_utils.InferenceResponse(output_tensors=[],
                                          error=pb_utils.TritonError(
                                              response.error_msg)), True
    result = response.result
    beam_lengths = np.expand_dims(
        np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
    max_beam_length = max([len(beam) for beam in result.output_token_ids])
    output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
                         -1, np.int32)
    for idx, beam in enumerate(result.output_token_ids):
        output_ids[0, idx, :len(beam)] = beam
    output_tensors = [
        pb_utils.Tensor("output_ids", output_ids),
        pb_utils.Tensor("sequence_length", beam_lengths),
    ]
    output_tensors.append(
        pb_utils.Tensor(
            "cum_log_probs",
            np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
            if result.cum_log_probs is not None else np.zeros(
                (1, 1), np.float32)))
    output_tensors.append(
        pb_utils.Tensor(
            "output_log_probs",
            np.expand_dims(np.array(result.log_probs, np.float32), 0) if
            result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
    output_tensors.append(
        pb_utils.Tensor(
            "context_logits",
            np.expand_dims(np.array(result.context_logits, np.float32), 0)
            if result.context_logits is not None else np.zeros(
                (1, 1, 1), np.float32)))
    output_tensors.append(
        pb_utils.Tensor(
            "generation_logits",
            np.expand_dims(np.array(result.generation_logits, np.float32), 0)
            if result.generation_logits is not None else np.zeros(
                (1, 1, 1, 1), np.float32)))
    return pb_utils.InferenceResponse(output_tensors), result.is_final


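# Output-shape sketch (derived from the code above): "output_ids" is
# [1, num_beams, max_beam_length] padded with -1, "sequence_length" is
# [1, num_beams], and the log-prob / logits tensors fall back to small
# zero-filled placeholders when the executor did not return them.

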
def convert_scheduler_policy(batch_scheduler_policy: str):
    if batch_scheduler_policy.lower() == "max_utilization":
        return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
    elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
        return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
    raise pb_utils.TritonModelException(
        f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
    )


def convert_batching_type(gpt_model_type: str):
    if gpt_model_type is None:
        return None
    if gpt_model_type.lower() in ("inflight_fused_batching",
                                  "inflight_batching"):
        return trtllm.BatchingType.INFLIGHT
    elif gpt_model_type.lower() == "v1":
        return trtllm.BatchingType.STATIC
    raise pb_utils.TritonModelException(
        f"gpt_model_type value of '{gpt_model_type}' is not supported.")


def convert_decoding_mode(decoding_mode: str):
    if decoding_mode is None:
        return None
    elif decoding_mode == "auto":
        return trtllm.DecodingMode.Auto()
    elif decoding_mode == "top_k":
        return trtllm.DecodingMode.TopK()
    elif decoding_mode == "top_p":
        return trtllm.DecodingMode.TopP()
    elif decoding_mode == "top_k_top_p":
        return trtllm.DecodingMode.TopKTopP()
    elif decoding_mode == "beam_search":
        return trtllm.DecodingMode.BeamSearch()
    elif decoding_mode == "medusa":
        return trtllm.DecodingMode.Medusa()
    raise pb_utils.TritonModelException(
        f"decoding_mode value of '{decoding_mode}' is not supported.")


def convert_timestamp_to_seconds(timestamp: str):
    return int(
        datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp())


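# Illustrative example (hypothetical timestamp): "05-01-2024 13:45:00" is
# parsed as month-day-year wall-clock time and converted to POSIX seconds using
# the server's local timezone; this is the format the metrics loop below
# expects for the executor's iteration-stats timestamp.

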
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def get_scheduler_config(self, model_config):
        batch_scheduler_policy = get_parameter(model_config,
                                               "batch_scheduler_policy")
        if batch_scheduler_policy is None:
            return trtllm.SchedulerConfig()
        return trtllm.SchedulerConfig(
            convert_scheduler_policy(batch_scheduler_policy))

    def get_kv_cache_config(self, model_config):
        kwargs = {
            "enable_block_reuse":
            get_parameter(model_config, "enable_kv_cache_reuse", bool),
            "max_tokens":
            get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
            "sink_token_length":
            get_parameter(model_config, "sink_token_length", int),
            "max_attention_window":
            get_parameter(model_config, "max_attention_window_size", int),
            "free_gpu_memory_fraction":
            get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
                          float),
            "host_cache_size":
            get_parameter(model_config, "kv_cache_host_memory_bytes", int),
            "onboard_blocks":
            get_parameter(model_config, "kv_cache_onboard_blocks", bool),
        }
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        return trtllm.KvCacheConfig(**kwargs)

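    # Illustrative config.pbtxt sketch (hypothetical values) for the KV-cache
    # parameters read above:
    #
    #   parameters: { key: "enable_kv_cache_reuse" value: { string_value: "true" } }
    #   parameters: { key: "kv_cache_free_gpu_mem_fraction" value: { string_value: "0.9" } }
    #
    # Unset or templated values ("${...}") are skipped so KvCacheConfig keeps
    # its defaults.
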
    def get_parallel_config(self, model_config):
        kwargs = {}
        gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
        if gpu_device_ids:
            kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
        self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
                                                    "0") == "1"
        if self.use_orchestrator_mode:
            kwargs[
                "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
            worker_path = get_parameter(model_config, "worker_path")
            if worker_path is not None:
                raise pb_utils.TritonModelException(
                    "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
                )
            executor_worker_path = get_parameter(model_config,
                                                 "executor_worker_path")
            kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
                True, executor_worker_path)
        if len(kwargs) > 0:
            return trtllm.ParallelConfig(**kwargs)
        return None

    def get_peft_cache_config(self, model_config):
        kwargs = {
            "optimal_adapter_size":
            get_parameter(model_config, "lora_cache_optimal_adapter_size",
                          int),
            "max_adapter_size":
            get_parameter(model_config, "lora_cache_max_adapter_size", int),
            "device_cache_percent":
            get_parameter(model_config, "lora_cache_gpu_memory_fraction",
                          float),
            "host_cache_size":
            get_parameter(model_config, "lora_cache_host_memory_bytes", int),
        }
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        return trtllm.PeftCacheConfig(**kwargs)

    def get_decoding_config(self, model_config):
        kwargs = {
            "medusa_choices":
            parse_medusa_choices(get_parameter(model_config,
                                               "medusa_choices")),
            "decoding_mode":
            convert_decoding_mode(get_parameter(model_config,
                                                "decoding_mode")),
        }
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        return trtllm.DecodingConfig(**kwargs)

    def get_executor_config(self, model_config):
        kwargs = {
            "max_beam_width":
            get_parameter(model_config, "max_beam_width", int),
            "scheduler_config":
            self.get_scheduler_config(model_config),
            "kv_cache_config":
            self.get_kv_cache_config(model_config),
            "enable_chunked_context":
            get_parameter(model_config, "enable_chunked_context", bool),
            "normalize_log_probs":
            get_parameter(model_config, "normalize_log_probs", bool),
            "batching_type":
            convert_batching_type(get_parameter(model_config,
                                                "gpt_model_type")),
            "parallel_config":
            self.get_parallel_config(model_config),
            "peft_cache_config":
            self.get_peft_cache_config(model_config),
            "decoding_config":
            self.get_decoding_config(model_config),
        }
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        return trtllm.ExecutorConfig(**kwargs)

    def create_metrics(self, model: str, version: str, is_v1_model: bool):
        self.request_metric_family = pb_utils.MetricFamily(
            name="nv_trt_llm_request_metrics",
            description="TRT LLM request metrics",
            kind=pb_utils.MetricFamily.GAUGE,
        )
        self.runtime_memory_metric_family = pb_utils.MetricFamily(
            name="nv_trt_llm_runtime_memory_metrics",
            description="TRT LLM runtime memory metrics",
            kind=pb_utils.MetricFamily.GAUGE,
        )
        self.kv_cache_metric_family = pb_utils.MetricFamily(
            name="nv_trt_llm_kv_cache_block_metrics",
            description="TRT LLM KV cache block metrics",
            kind=pb_utils.MetricFamily.GAUGE,
        )
        model_type = "v1" if is_v1_model else "inflight_batcher"
        self.model_type_metric_family = pb_utils.MetricFamily(
            name=f"nv_trt_llm_{model_type}_metrics",
            description=f"TRT LLM {model_type}-specific metrics",
            kind=pb_utils.MetricFamily.GAUGE,
        )
        self.general_metric_family = pb_utils.MetricFamily(
            name="nv_trt_llm_general_metrics",
            description="General TRT LLM metrics",
            kind=pb_utils.MetricFamily.GAUGE,
        )
        common_labels = {"model": model, "version": version}
        self.all_metrics = {
            # Request metrics
            "num_active_requests":
            self.request_metric_family.Metric(labels={
                "request_type": "active",
                **common_labels
            }),
            "max_num_active_requests":
            self.request_metric_family.Metric(labels={
                "request_type": "max",
                **common_labels
            }),
            "num_scheduled_requests":
            self.request_metric_family.Metric(labels={
                "request_type": "scheduled",
                **common_labels
            }),
            "num_context_requests":
            self.request_metric_family.Metric(labels={
                "request_type": "context",
                **common_labels
            }),
            # Runtime memory metrics
            "cpu_mem_usage":
            self.runtime_memory_metric_family.Metric(labels={
                "memory_type": "cpu",
                **common_labels
            }),
            "gpu_mem_usage":
            self.runtime_memory_metric_family.Metric(labels={
                "memory_type": "gpu",
                **common_labels
            }),
            "pinned_mem_usage":
            self.runtime_memory_metric_family.Metric(labels={
                "memory_type": "pinned",
                **common_labels
            }),
            # KV cache block metrics
            "max_num_blocks":
            self.kv_cache_metric_family.Metric(labels={
                "kv_cache_block_type": "max",
                **common_labels
            }),
            "free_num_blocks":
            self.kv_cache_metric_family.Metric(labels={
                "kv_cache_block_type": "free",
                **common_labels
            }),
            "used_num_blocks":
            self.kv_cache_metric_family.Metric(labels={
                "kv_cache_block_type": "used",
                **common_labels
            }),
            "tokens_per_block":
            self.kv_cache_metric_family.Metric(labels={
                "kv_cache_block_type": "tokens_per",
                **common_labels
            }),
            # General metrics
            "timestamp":
            self.general_metric_family.Metric(labels={
                "general_type": "timestamp",
                **common_labels
            }),
            "iter":
            self.general_metric_family.Metric(labels={
                "general_type": "iteration_counter",
                **common_labels
            }),
        }
        if is_v1_model:
            self.all_metrics.update({
                "num_ctx_tokens":
                self.model_type_metric_family.Metric(labels={
                    "v1_specific_metric": "total_context_tokens",
                    **common_labels
                }),
                "num_gen_tokens":
                self.model_type_metric_family.Metric(
                    labels={
                        "v1_specific_metric": "total_generation_tokens",
                        **common_labels
                    }),
                "empty_gen_slots":
                self.model_type_metric_family.Metric(
                    labels={
                        "v1_specific_metric": "empty_generation_slots",
                        **common_labels
                    }),
            })
        else:
            self.all_metrics.update({
                "num_ctx_tokens":
                self.model_type_metric_family.Metric(
                    labels={
                        "inflight_batcher_specific_metric":
                        "total_context_tokens",
                        **common_labels
                    }),
                "num_gen_requests":
                self.model_type_metric_family.Metric(
                    labels={
                        "inflight_batcher_specific_metric":
                        "generation_requests",
                        **common_labels
                    }),
                "micro_batch_id":
                self.model_type_metric_family.Metric(
                    labels={
                        "inflight_batcher_specific_metric": "micro_batch_id",
                        **common_labels
                    }),
                "num_paused_requests":
                self.model_type_metric_family.Metric(
                    labels={
                        "inflight_batcher_specific_metric": "paused_requests",
                        **common_labels
                    }),
            })

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        model_config = json.loads(args['model_config'])
        gpt_model_path = get_parameter(model_config, "gpt_model_path")
        if get_parameter(model_config, "enable_trt_overlap", bool):
            raise pb_utils.TritonModelException(
                "enable_trt_overlap=true is not supported.")
        self.exclude_input_from_output = get_parameter(
            model_config, "exclude_input_in_output", bool)
        executor_config = self.get_executor_config(model_config)
        self.executor = trtllm.Executor(gpt_model_path,
                                        trtllm.ModelType.DECODER_ONLY,
                                        executor_config)
        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)
        self.cancellation_check_period_ms = get_parameter(
            model_config, "cancellation_check_period_ms", int) or 100
        self.stats_check_period_ms = get_parameter(
            model_config, "stats_check_period_ms", int) or 100

        if not self.decoupled:
            raise pb_utils.TritonModelException(
                "Please enable decoupled transaction policy in the model configuration to serve this model"
            )

        self.create_metrics(args["model_name"],
                            args["model_version"],
                            is_v1_model=executor_config.batching_type ==
                            trtllm.BatchingType.STATIC)
        self.triton_id_to_req_id = {}
        self.req_id_to_response_sender = {}
        self.lock = Lock()
        self.running = False
        self.awaiter_thread = Thread(target=self.awaiter_loop)
        self.cancellation_thread = Thread(target=self.cancellation_loop)
        self.metrics_thread = Thread(target=self.metrics_loop)
        if self.executor.can_enqueue_requests():
            self.running = True
            self.awaiter_thread.start()
            self.cancellation_thread.start()
            self.metrics_thread.start()
        else:
            self.executor.shutdown()

    def handle_stop_request(self, triton_id, response_sender):
        if triton_id is None or triton_id == "":
            response_sender.send(
                pb_utils.InferenceResponse(error=pb_utils.TritonError(
                    "A request id must be provided for request cancellation")),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
            return

        if triton_id in self.triton_id_to_req_id:
            req_id = self.triton_id_to_req_id[triton_id]
            self.executor.cancel_request(req_id)

        response_sender.send(
            pb_utils.InferenceResponse(),
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

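    # Cancellation sketch (derived from execute/handle_stop_request below): a
    # client cancels an in-flight request by sending a new request that reuses
    # the same Triton request id and sets the boolean "stop" input to true.
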
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        if not self.executor.can_enqueue_requests():
            return

        triton_requests = []
        executor_requests = []
        for request in requests:
            response_sender = request.get_response_sender()
            if get_input_scalar_by_name(request, 'stop'):
                self.handle_stop_request(request.request_id(), response_sender)
            else:
                try:
                    converted = convert_request(request,
                                                self.exclude_input_from_output,
                                                self.decoupled)
                except Exception as e:
                    response_sender.send(
                        pb_utils.InferenceResponse(error=pb_utils.TritonError(
                            f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
                        )),
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    triton_requests.append(request)
                    executor_requests.append(converted)

        with self.lock:
            request_ids = self.executor.enqueue_requests(executor_requests)
            for req_id, request in zip(request_ids, triton_requests):
                triton_id = request.request_id()
                self.req_id_to_response_sender[
                    req_id] = triton_id, request.get_response_sender()
                self.triton_id_to_req_id[triton_id] = req_id
        return None

    def awaiter_loop(self):
        """Gets responses from executor and returns the results."""
        while self.running:
            for response in self.executor.await_responses(
                    timeout=datetime.timedelta(milliseconds=1)):
                req_id = response.request_id
                with self.lock:
                    if req_id not in self.req_id_to_response_sender:
                        continue
                    triton_id, response_sender = self.req_id_to_response_sender[
                        req_id]

                triton_response, is_final = convert_response(response)
                response_sender.send(
                    triton_response,
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
                    if is_final else 0)

                if is_final:
                    with self.lock:
                        del self.triton_id_to_req_id[triton_id]
                        del self.req_id_to_response_sender[req_id]

                # Drop the local reference so the response sender can be
                # released promptly.
                del response_sender

    def cancellation_loop(self):
        """Checks if any pending requests have been cancelled."""
        while self.running:
            time.sleep(self.cancellation_check_period_ms / 1000.0)
            with self.lock:
                for req_id, (triton_id, response_sender
                             ) in self.req_id_to_response_sender.items():
                    if response_sender.is_cancelled():
                        self.executor.cancel_request(req_id)
                    # Drop the local reference so the response sender can be
                    # released promptly.
                    del response_sender

    def metrics_loop(self):
        """Updates triton metrics using stats from the executor."""
        while self.running:
            time.sleep(self.stats_check_period_ms / 1000.0)
            for stat in self.executor.get_latest_iteration_stats():
                try:
                    for key, metric in self.all_metrics.items():
                        value = None
                        if hasattr(stat, key):
                            value = getattr(stat, key)
                        elif stat.kv_cache_stats is not None and hasattr(
                                stat.kv_cache_stats, key):
                            value = getattr(stat.kv_cache_stats, key)
                        elif stat.static_batching_stats is not None and hasattr(
                                stat.static_batching_stats, key):
                            value = getattr(stat.static_batching_stats, key)
                        elif stat.inflight_batching_stats is not None and hasattr(
                                stat.inflight_batching_stats, key):
                            value = getattr(stat.inflight_batching_stats, key)
                        if value is not None:
                            if key == "timestamp":
                                value = convert_timestamp_to_seconds(value)
                            metric.set(value)
                        else:
                            pb_utils.Logger.log_warn(
                                f"Metric \"{key}\" not found.")
                except Exception as e:
                    pb_utils.Logger.log_warn(
                        f"Error while processing metrics: {e}")

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        if self.executor.can_enqueue_requests():
            self.running = False
            self.awaiter_thread.join()
            self.cancellation_thread.join()
            self.metrics_thread.join()
            self.executor.shutdown()