| import os | |
| import subprocess | |
def run_vllm_inference(
    model: str = "Imran1/Qwen2.5-72B-Instruct-FP8",
    tensor_parallel_size: int = 4,
    max_model_len: int = 2000,
    api_key: str = "token-abc123",
) -> None:
    """Launch a vLLM OpenAI-compatible server for *model* as a subprocess.

    Blocks until the ``vllm serve`` process exits.

    Args:
        model: Hugging Face model ID to serve.
        tensor_parallel_size: Number of GPUs to shard the model across.
        max_model_len: Maximum context length for the served model.
        api_key: API key clients must present to the server.

    Raises:
        subprocess.CalledProcessError: If ``vllm serve`` exits non-zero.
    """
    # Enable hf_transfer for faster model downloads from the Hugging Face Hub.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # SECURITY NOTE(review): the default api_key is a hard-coded placeholder;
    # supply a real secret (e.g. read from an environment variable) in production.
    command = [
        "vllm", "serve", model,
        "--tensor-parallel-size", str(tensor_parallel_size),
        "--dtype", "auto",
        "--api-key", api_key,
        "--max-model-len", str(max_model_len),
        "--kv-cache-dtype", "auto",
    ]
    # check=True makes server startup failures raise instead of being ignored.
    # List-form argv (shell=False default) avoids shell-injection pitfalls.
    subprocess.run(command, check=True)
| if __name__ == "__main__": | |
| run_vllm_inference() | |