# Run a short, profiled generation with the example runner shipped in the
# TensorRT-LLM repository.
#
# The clone is skipped when ./TensorRT-LLM already exists, because git clone
# refuses to write into an existing non-empty destination. run.py is launched
# only when the checkout is available, so a failed clone does not cascade into
# an unrelated "No such file or directory" error from python.
#
# Flags passed to run.py:
#   --engine_dir ./            engine directory is the current working directory
#   --max_output_len 5         presumably caps the number of generated tokens
#   --tokenizer_dir llama3-hf  tokenizer directory, relative to the cwd
#   --input_text "..."         the prompt to complete
#   --run_profiling            also reports an average latency over repeated
#                              iterations
{ [ -d ./TensorRT-LLM ] || git clone https://github.com/NVIDIA/TensorRT-LLM.git; } &&
  python ./TensorRT-LLM/examples/run.py \
    --engine_dir ./ \
    --max_output_len 5 \
    --tokenizer_dir llama3-hf \
    --input_text "How do I count to nine in French?" \
    --run_profiling
2024-04-25 19:35:59.062455: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Input [Text 0]: "<|begin_of_text|>How do I count to nine in French?"
Output [Text 0 Beam 0]: " Counting in French is"
batch_size: 1, avg latency of 10 iterations: : 0.0999948501586914 sec