| import os | |
| import subprocess | |
def run_vllm_inference(
    model: str = "Imran1/Qwen2.5-72B-Instruct-FP8",
    tensor_parallel_size: int = 4,
    max_model_len: int = 2000,
    api_key: str = "token-abc123",
) -> None:
    """Launch a vLLM OpenAI-compatible server for *model* as a subprocess.

    Blocks until the ``vllm serve`` process exits.

    Args:
        model: Hugging Face model ID to serve.
        tensor_parallel_size: Number of GPUs to shard the model across.
        max_model_len: Maximum context length for the served model.
        api_key: API key clients must present to the server.

    Raises:
        subprocess.CalledProcessError: If ``vllm serve`` exits non-zero.
    """
    # Enable hf_transfer for faster model downloads from the Hugging Face Hub.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # SECURITY NOTE(review): the default api_key is a hard-coded placeholder;
    # supply a real secret (e.g. read from an environment variable) in production.
    command = [
        "vllm", "serve", model,
        "--tensor-parallel-size", str(tensor_parallel_size),
        "--dtype", "auto",
        "--api-key", api_key,
        "--max-model-len", str(max_model_len),
        "--kv-cache-dtype", "auto",
    ]
    # check=True makes server startup failures raise instead of being ignored.
    # List-form argv (shell=False default) avoids shell-injection pitfalls.
    subprocess.run(command, check=True)
| if __name__ == "__main__": | |
| run_vllm_inference() | |