README.md · muhtasham/llama3-ins-8b-int4-trt-llm at 5413819bc14bcc003dbb6ab90d64b249e2aaf96c

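Clone TensorRT-LLM to get the example runner script:

```bash
git clone https://github.com/NVIDIA/TensorRT-LLM.git
```

Run the engine in the current directory with the example script:

```bash
python ./TensorRT-LLM/examples/run.py --engine_dir=./ \
  --max_output_len 5 \
  --tokenizer_dir llama3-hf \
  --input_text "How do I count to nine in French?" \
  --run_profiling
```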

Example output:

```text
2024-04-25 19:35:59.062455: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Input [Text 0]: "<|begin_of_text|>How do I count to nine in French?"
Output [Text 0 Beam 0]: " Counting in French is"
batch_size: 1, avg latency of 10 iterations: : 0.0999948501586914 sec
```