File size: 552 Bytes
3c5ff26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
import os
import subprocess
def run_vllm_inference(dry_run=False):
    """Launch a vLLM OpenAI-compatible server for Qwen2.5-72B-Instruct-FP8.

    Builds the ``vllm serve`` command line and executes it as a blocking
    subprocess (the call does not return until the server process exits).

    Args:
        dry_run: When True, skip execution and return the command list
            instead — useful for inspection and testing.

    Returns:
        The command list (``list[str]``) when ``dry_run`` is True,
        otherwise the ``subprocess.CompletedProcess`` from running it.
    """
    # Enable hf_transfer for faster HuggingFace model downloads.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # vLLM serve command. NOTE(review): the API key is hard-coded here;
    # consider sourcing it from an environment variable instead.
    command = [
        "vllm", "serve", "Imran1/Qwen2.5-72B-Instruct-FP8",
        "--tensor-parallel-size", "4",
        "--dtype", "auto",
        "--api-key", "token-abc123",
        "--max-model-len", "2000",
        "--kv-cache-dtype", "auto",
    ]

    if dry_run:
        return command

    # List form (shell=False) avoids shell-injection issues; return the
    # CompletedProcess so callers can check the server's exit status.
    return subprocess.run(command)
# Script entry point: launch the vLLM server when run directly
# (blocks until the server process exits).
if __name__ == "__main__":
    run_vllm_inference()
|