# Arena / launch_models.py
# feat: Add scripts for launching models with vLLM, including dynamic GPU
# allocation and logging functionality for better monitoring and management
# of model processes. (commit 04ffcc8, by Elfsong)
#!/usr/bin/env python3
# coding: utf-8
# Author: Du Mingzhe (dumingzhex@gmail.com)
# Date: 2025-02-03
import os
import subprocess
import time
from pathlib import Path
# Create logs directory (one log file per launched model is written here by
# the launch loop below; exist_ok avoids failing on re-runs).
Path("./logs").mkdir(exist_ok=True)
# Launch models via vLLM.
# Each entry is (gpu_id, iter_num): the CUDA device the server is pinned to
# (via CUDA_VISIBLE_DEVICES) and the training-iteration number of the
# checkpoint to serve (used to build the HF model id and the log file name).
# Commented-out tuples are checkpoints currently disabled.
model_gpu_mapping = [
# (0, 1000),
# (0, 1500),
# (1, 2000),
# (1, 2500),
# (2, 3000),
(1, 3500),
# (1, 4000),
(1, 4500),
# (1, 5000),
(1, 5500),
# (2, 6000),
(3, 6500),
# (3, 7000),
(3, 7500),
]
# Spin up one vLLM OpenAI-compatible server per configured checkpoint.
# Ports are assigned sequentially from 9000; each server is pinned to its
# GPU through CUDA_VISIBLE_DEVICES and logs to ./logs/vllm_<iter>.log.
launched_models = []
for slot, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
    iter_tag = f"{iter_num:07d}"
    model_name = f"Elfsong/VLM_stage_2_iter_{iter_tag}"
    arena_key = f"Local-Model-{iter_num:05d}"
    port = 9000 + slot
    print(f"🚀 Launching {model_name} on port {port} (GPU {gpu_id}) ...")
    # The log file handle must outlive this loop iteration — the child
    # process writes to it for its whole lifetime — so no `with` here;
    # it is closed during shutdown.
    log_file = open(f"./logs/vllm_{iter_tag}.log", "w")
    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--port", str(port),
        "--quantization", "bitsandbytes",
        "--gpu-memory-utilization", "0.3",
        "--max-model-len", "4096",
        "--trust-remote-code",
    ]
    # Copy the current environment and pin the child to a single GPU.
    child_env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu_id))
    process = subprocess.Popen(cmd, env=child_env, stdout=log_file, stderr=log_file)
    launched_models.append({
        "process": process,
        "model_name": model_name,
        "port": port,
        "gpu_id": gpu_id,
        "arena_key": arena_key,
        "log_file": log_file,
    })
    time.sleep(10)  # Wait for initialization
print(f"✅ Launched {len(launched_models)} models. Check logs in ./logs/ directory.")
# Keep the script running and monitor processes.
# Fixes over the naive version:
#  - a dead model is reported once, not re-announced every 60 s;
#  - on Ctrl+C, terminated children are reaped with wait(); any child that
#    ignores SIGTERM is hard-killed, so no zombies are left behind.
try:
    print("Models are running. Press Ctrl+C to stop all models.")
    reported_dead = set()  # ports whose death has already been announced
    while True:
        time.sleep(60)
        # Check if any processes have died (poll() returns the exit code
        # once the child has terminated, None while it is still running).
        for model in launched_models:
            if model["process"].poll() is not None and model["port"] not in reported_dead:
                reported_dead.add(model["port"])
                print(f"⚠️ Model {model['model_name']} (port {model['port']}) has stopped.")
except KeyboardInterrupt:
    print("\n🛑 Stopping all models...")
    # First pass: ask every still-running server to shut down.
    for model in launched_models:
        if model["process"].poll() is None:
            print(f"Stopping {model['model_name']} (port {model['port']})...")
            model["process"].terminate()
    # Second pass: reap each child; escalate to SIGKILL if it does not exit
    # within the grace period, then close its log file.
    for model in launched_models:
        try:
            model["process"].wait(timeout=10)
        except subprocess.TimeoutExpired:
            model["process"].kill()
            model["process"].wait()
        model["log_file"].close()
    print("✅ All models stopped.")