amd
/

Meta-Llama-3.1-405B-Instruct-fp8-quark-vllm

Model card | Files and versions | Community

seungrok81 committed on Aug 14

Commit

1dc70e5

•

1 Parent(s): 972b266

Update README.md

Files changed (1) hide show

README.md +3 -3

README.md CHANGED Viewed

@@ -43,7 +43,7 @@ torchrun --standalone --nproc_per_node=8 run_vllm_fp8.py
 from vllm import LLM, SamplingParams
 prompt = "Write me an essay about bear and knight"
-model_name="/workspace/models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26/"
 tp=8 # 8 GPUs
 model = LLM(model=model_name, tensor_parallel_size=tp, max_model_len=8192, trust_remote_code=True, dtype="float16", quantization="fp8", quantized_weights_path="/llama.safetensors")
@@ -59,7 +59,7 @@ print(result)
 ```sh
 # 8 GPUs
-torchrun --standalone --nproc_per_node=8 run_vllm_fp8.py
 ```
 ```python
@@ -67,7 +67,7 @@ torchrun --standalone --nproc_per_node=8 run_vllm_fp8.py
 from vllm import LLM, SamplingParams
 prompt = "Write me an essay about bear and knight"
-model_name="/workspace/models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26/"
 tp=8 # 8 GPUs
 model = LLM(model=model_name, tensor_parallel_size=tp, max_model_len=8192, trust_remote_code=True, dtype="bfloat16")
 sampling_params = SamplingParams(

 from vllm import LLM, SamplingParams
 prompt = "Write me an essay about bear and knight"
+model_name="models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26/"
 tp=8 # 8 GPUs
 model = LLM(model=model_name, tensor_parallel_size=tp, max_model_len=8192, trust_remote_code=True, dtype="float16", quantization="fp8", quantized_weights_path="/llama.safetensors")
 ```sh
 # 8 GPUs
+torchrun --standalone --nproc_per_node=8 run_vllm_fp16.py
 ```
 ```python
 from vllm import LLM, SamplingParams
 prompt = "Write me an essay about bear and knight"
+model_name="models--meta-llama--Meta-Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26/"
 tp=8 # 8 GPUs
 model = LLM(model=model_name, tensor_parallel_size=tp, max_model_len=8192, trust_remote_code=True, dtype="bfloat16")
 sampling_params = SamplingParams(