DeepInfra
/

Llama-2-70b-chat-hf-trt-fp8

Model card | Files and versions | Community

Llama-2-70b-chat-hf-trt-fp8 / tensorrt_llm / config.pbtxt

yessenzhar

update models for newer trt 0.6.1 version

dd20dba about 1 year ago

raw

history blame

6.01 kB

	# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	# * Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	# * Neither the name of NVIDIA CORPORATION nor the names of its
	# contributors may be used to endorse or promote products derived
	# from this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
	# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	# Top-level model identity and batching limit.
	#   name           - model name this configuration is registered under.
	#   backend        - backend that executes the model ("tensorrtllm").
	#   max_batch_size - largest batch the server may form (64); because it is
	#                    non-zero, the `dims` of the tensors declared in this
	#                    file do not include the batch dimension.
	name: "tensorrt_llm"
	backend: "tensorrtllm"
	max_batch_size: 64

	# Decoupled transaction policy is enabled: per the Triton model
	# configuration docs, a decoupled model may emit any number of responses
	# per request (this config also declares a "streaming" input).
	model_transaction_policy { decoupled: true }

	# Dynamic batcher settings: the preferred batch size is 1 and a request
	# may be held in the queue for at most 1000 microseconds while a batch
	# is being formed.
	dynamic_batching {
		preferred_batch_size: 1
		max_queue_delay_microseconds: 1000
	}

	# Input tensors accepted by this model. `dims` exclude the batch dimension.
	#
	# Conventions used by the entries below (see the Triton model
	# configuration documentation):
	#   optional: true           - a request may omit the tensor.
	#   allow_ragged_batch: true - requests batched together may carry
	#                              different shapes for the tensor.
	#   reshape: { shape: [ ] }  - the request carries shape [ 1 ]; the
	#                              backend receives it with that dimension
	#                              removed.
	#
	# Only "input_ids", "input_lengths" and "request_output_len" are required;
	# every other input is marked optional.
	input [
	{
	# Required. Variable-length INT32 vector, ragged across the batch.
	# Presumably the prompt token ids - confirm against the backend docs.
	name: "input_ids"
	data_type: TYPE_INT32
	dims: [ -1 ]
	allow_ragged_batch: true
	},
	{
	# Required. One INT32 per request.
	name: "input_lengths"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	},
	{
	# Required. One INT32 per request; unlike "input_lengths" it has no
	# reshape, so the backend sees shape [ 1 ].
	name: "request_output_len"
	data_type: TYPE_INT32
	dims: [ 1 ]
	},
	{
	# Optional variable-length INT32 vector, ragged across the batch.
	name: "draft_input_ids"
	data_type: TYPE_INT32
	dims: [ -1 ]
	optional: true
	allow_ragged_batch: true
	},
	{
	# Optional INT32 scalar.
	name: "end_id"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	# Optional INT32 scalar.
	name: "pad_id"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	# Optional INT32 tensor with a fixed first dimension of 2 and a variable
	# second dimension; ragged across the batch.
	name: "stop_words_list"
	data_type: TYPE_INT32
	dims: [ 2, -1 ]
	optional: true
	allow_ragged_batch: true
	},
	{
	# Optional; same layout as "stop_words_list".
	name: "bad_words_list"
	data_type: TYPE_INT32
	dims: [ 2, -1 ]
	optional: true
	allow_ragged_batch: true
	},
	{
	# Optional variable-length FP32 vector, ragged across the batch.
	name: "embedding_bias"
	data_type: TYPE_FP32
	dims: [ -1 ]
	optional: true
	allow_ragged_batch: true
	},
	# The entries from "beam_width" through "return_log_probs" are optional
	# per-request scalars (dims [ 1 ] reshaped to [ ]). NOTE(review): their
	# names suggest decoding/sampling controls; their exact semantics are
	# defined by the backend, not by this file - confirm there.
	{
	name: "beam_width"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "temperature"
	data_type: TYPE_FP32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "runtime_top_k"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "runtime_top_p"
	data_type: TYPE_FP32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "len_penalty"
	data_type: TYPE_FP32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "repetition_penalty"
	data_type: TYPE_FP32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "min_length"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "presence_penalty"
	data_type: TYPE_FP32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	# The only 64-bit unsigned input in this file.
	name: "random_seed"
	data_type: TYPE_UINT64
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	name: "return_log_probs"
	data_type: TYPE_BOOL
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	},
	{
	# Optional BOOL of shape [ 1 ]; no reshape is applied.
	name: "stop"
	data_type: TYPE_BOOL
	dims: [ 1 ]
	optional: true
	},
	{
	# Optional BOOL of shape [ 1 ]; no reshape is applied.
	name: "streaming"
	data_type: TYPE_BOOL
	dims: [ 1 ]
	optional: true
	},
	{
	# Optional FP16 matrix, both dimensions variable, ragged across the batch.
	name: "prompt_embedding_table"
	data_type: TYPE_FP16
	dims: [ -1, -1 ]
	optional: true
	allow_ragged_batch: true
	},
	{
	# Optional INT32 scalar.
	name: "prompt_vocab_size"
	data_type: TYPE_INT32
	dims: [ 1 ]
	reshape: { shape: [ ] }
	optional: true
	}
	]
	# Output tensors produced by the model (batch dimension not included in
	# dims). Two INT32 tensors carry ids and lengths; two FP32 tensors carry
	# log-probabilities. All non-batch dimensions are variable (-1).
	output [
		{ name: "output_ids"       data_type: TYPE_INT32 dims: [ -1, -1 ] },
		{ name: "sequence_length"  data_type: TYPE_INT32 dims: [ -1 ] },
		{ name: "cum_log_probs"    data_type: TYPE_FP32  dims: [ -1 ] },
		{ name: "output_log_probs" data_type: TYPE_FP32  dims: [ -1, -1 ] }
	]
	# A single model instance, declared with kind KIND_CPU.
	# NOTE(review): presumably the backend manages GPU placement itself and
	# this only describes the Triton-side instance - confirm against the
	# backend documentation before changing it.
	instance_group [
		{ count: 1 kind: KIND_CPU }
	]
	# Backend parameter "max_beam_width" = "1".
	# NOTE(review): presumably the upper bound for the per-request
	# "beam_width" input - confirm against the backend docs.
	parameters {
		key: "max_beam_width"
		value { string_value: "1" }
	}
	# Backend parameter "FORCE_CPU_ONLY_INPUT_TENSORS" = "no".
	# NOTE(review): presumably lets input tensors stay in GPU memory instead
	# of being forced to CPU - confirm against the Triton backend docs.
	parameters {
		key: "FORCE_CPU_ONLY_INPUT_TENSORS"
		value { string_value: "no" }
	}
	# Backend parameter "gpt_model_type" = "inflight_fused_batching".
	# The set of accepted values is defined by the backend, not by this file.
	parameters {
		key: "gpt_model_type"
		value { string_value: "inflight_fused_batching" }
	}
	# Backend parameter "gpt_model_path": filesystem location handed to the
	# backend (presumably the directory holding the built engine - confirm).
	# NOTE(review): this is an absolute, machine-specific path; it must exist
	# on the serving host and needs editing when the model is deployed
	# elsewhere.
	parameters {
		key: "gpt_model_path"
		value { string_value: "/data/tgi-data/yessen/Llama-2-70b-chat-hf-trt-fp8/tensorrt_llm/1" }
	}
	# Backend parameter "max_tokens_in_paged_kv_cache" = "40000".
	# NOTE(review): "kv_cache_free_gpu_mem_fraction" is also set in this
	# file; how the backend combines the two limits is not shown here -
	# confirm against the backend docs.
	parameters {
		key: "max_tokens_in_paged_kv_cache"
		value { string_value: "40000" }
	}
	# Backend parameter "max_attention_window_size" = "4096".
	# NOTE(review): presumably chosen to match the model's context length -
	# confirm.
	parameters {
		key: "max_attention_window_size"
		value { string_value: "4096" }
	}
	# Backend parameter "batch_scheduler_policy" = "max_utilization".
	# The available policies and their trade-offs are defined by the backend.
	parameters {
		key: "batch_scheduler_policy"
		value { string_value: "max_utilization" }
	}
	# Backend parameter "kv_cache_free_gpu_mem_fraction" = "0.9".
	# NOTE(review): presumably the share of free GPU memory the KV cache may
	# claim - confirm against the backend docs.
	parameters {
		key: "kv_cache_free_gpu_mem_fraction"
		value { string_value: "0.9" }
	}
	# Backend parameter "max_num_sequences" = "64"; the same number as
	# max_batch_size declared at the top of this file.
	parameters {
		key: "max_num_sequences"
		value { string_value: "64" }
	}
	# Backend parameter "enable_trt_overlap" = "false" (feature switched off).
	parameters {
		key: "enable_trt_overlap"
		value { string_value: "false" }
	}
	# Backend parameter "exclude_input_in_output" = "true".
	# NOTE(review): presumably makes "output_ids" contain only newly
	# generated tokens, without the prompt - confirm against the backend docs.
	parameters {
		key: "exclude_input_in_output"
		value { string_value: "true" }
	}
	# Backend parameter "enable_kv_cache_reuse".
	# The value was previously the literal template placeholder
	# "${enable_kv_cache_reuse}", i.e. the template was never filled in for
	# this key, so the backend was handed a string that is not a boolean.
	# It is now an explicit "false".
	# NOTE(review): this assumes the backend treats an unparseable value as
	# "reuse disabled", so "false" keeps the effective behaviour - confirm
	# against the backend version in use. Set it to "true" only if the engine
	# was built with support for KV cache reuse.
	parameters: {
	key: "enable_kv_cache_reuse"
	value: {
	string_value: "false"
	}
	}