Spaces:
Runtime error
Runtime error
# Multi-GPU Neural OS Startup Script | |
# Function to detect number of GPUs automatically | |
detect_gpu_count() { | |
if command -v nvidia-smi >/dev/null 2>&1; then | |
# Use nvidia-smi to count GPUs | |
local gpu_count=$(nvidia-smi -L 2>/dev/null | wc -l) | |
if [ "$gpu_count" -gt 0 ]; then | |
echo "$gpu_count" | |
return 0 | |
fi | |
fi | |
# If nvidia-smi fails, try alternative methods | |
if [ -d "/proc/driver/nvidia/gpus" ]; then | |
local gpu_count=$(ls -d /proc/driver/nvidia/gpus/*/information 2>/dev/null | wc -l) | |
if [ "$gpu_count" -gt 0 ]; then | |
echo "$gpu_count" | |
return 0 | |
fi | |
fi | |
# Default fallback | |
echo "1" | |
return 1 | |
} | |
# Detect GPU count automatically | |
DETECTED_GPUS=$(detect_gpu_count) | |
GPU_DETECTION_SUCCESS=$? | |
# Default values | |
NUM_GPUS=$DETECTED_GPUS | |
DISPATCHER_PORT=7860 | |
# Parse command line arguments | |
while [[ $# -gt 0 ]]; do | |
case $1 in | |
--num-gpus) | |
NUM_GPUS="$2" | |
shift 2 | |
;; | |
--port) | |
DISPATCHER_PORT="$2" | |
shift 2 | |
;; | |
-h|--help) | |
echo "Usage: $0 [--num-gpus N] [--port PORT]" | |
echo " --num-gpus N Number of GPU workers to start (default: auto-detected)" | |
echo " --port PORT Dispatcher port (default: 7860)" | |
echo "" | |
echo "GPU Detection:" | |
echo " Automatically detects available GPUs using nvidia-smi" | |
echo " Currently detected: $DETECTED_GPUS GPU(s)" | |
if [ $GPU_DETECTION_SUCCESS -ne 0 ]; then | |
echo " β οΈ GPU detection failed - using fallback of 1 GPU" | |
fi | |
exit 0 | |
;; | |
*) | |
echo "Unknown option: $1" >&2 | |
exit 1 | |
;; | |
esac | |
done | |
# Function to cleanup background processes | |
cleanup() { | |
echo "" | |
echo "π Shutting down system..." | |
# Kill dispatcher | |
if [[ -n $DISPATCHER_PID ]]; then | |
echo "Stopping dispatcher (PID: $DISPATCHER_PID)..." | |
kill $DISPATCHER_PID 2>/dev/null | |
wait $DISPATCHER_PID 2>/dev/null | |
fi | |
# Kill workers by finding their processes | |
echo "Stopping workers..." | |
pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true | |
sleep 2 | |
# Force kill if any are still running | |
pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true | |
echo "β System stopped" | |
exit 0 | |
} | |
# Set up signal handlers | |
trap cleanup SIGINT SIGTERM | |
echo "π Starting Multi-GPU Neural OS System" | |
echo "========================================" | |
echo "π GPU Detection: $DETECTED_GPUS GPU(s) detected" | |
if [ $GPU_DETECTION_SUCCESS -ne 0 ]; then | |
echo "β οΈ GPU detection failed - using fallback count" | |
elif command -v nvidia-smi >/dev/null 2>&1; then | |
echo "π Detected GPUs:" | |
nvidia-smi -L 2>/dev/null | sed 's/^/ /' | |
fi | |
echo "π Number of GPUs: $NUM_GPUS" | |
echo "π Dispatcher port: $DISPATCHER_PORT" | |
echo "π» Worker ports: $(seq -s', ' 8001 $((8000 + NUM_GPUS)))" | |
echo "π Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log" | |
echo "" | |
# Validate that we're not trying to start more workers than GPUs | |
if [ "$NUM_GPUS" -gt "$DETECTED_GPUS" ]; then | |
echo "β οΈ Warning: Trying to start $NUM_GPUS workers but only $DETECTED_GPUS GPU(s) detected" | |
echo " This may cause GPU sharing or errors. Consider using --num-gpus $DETECTED_GPUS" | |
echo "" | |
fi | |
# Check if required files exist | |
if [[ ! -f "dispatcher.py" ]]; then | |
echo "β Error: dispatcher.py not found" | |
exit 1 | |
fi | |
if [[ ! -f "worker.py" ]]; then | |
echo "β Error: worker.py not found" | |
exit 1 | |
fi | |
if [[ ! -f "start_workers.py" ]]; then | |
echo "β Error: start_workers.py not found" | |
exit 1 | |
fi | |
# Start dispatcher | |
echo "π― Starting dispatcher..." | |
python dispatcher.py --port $DISPATCHER_PORT > dispatcher.log 2>&1 & | |
DISPATCHER_PID=$! | |
# Wait a bit for dispatcher to start | |
sleep 3 | |
# Check if dispatcher started successfully | |
if ! kill -0 $DISPATCHER_PID 2>/dev/null; then | |
echo "β Failed to start dispatcher. Check dispatcher.log for errors." | |
exit 1 | |
fi | |
echo "β Dispatcher started (PID: $DISPATCHER_PID)" | |
# Start workers | |
echo "π§ Starting $NUM_GPUS GPU workers..." | |
python start_workers.py --num-gpus $NUM_GPUS --dispatcher-url "http://localhost:$DISPATCHER_PORT" --no-monitor > workers.log 2>&1 | |
WORKER_START_EXIT_CODE=$? | |
# Wait for workers to fully load models and register (60 seconds) | |
echo "β³ Waiting 60 seconds for workers to load models and register..." | |
sleep 60 | |
# Check if workers started successfully by checking the exit code and log | |
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then | |
echo "β Failed to start workers. Check workers.log for errors." | |
cleanup | |
exit 1 | |
fi | |
# Check if workers are actually running by looking for their processes (updated for new --worker-address format) | |
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0") | |
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then | |
echo "β Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors." | |
cleanup | |
exit 1 | |
fi | |
echo "β Workers started successfully ($RUNNING_WORKERS workers running)" | |
echo "" | |
echo "π System is ready!" | |
echo "================================" | |
echo "π Web interface: http://localhost:$DISPATCHER_PORT" | |
echo "π Dispatcher health: http://localhost:$DISPATCHER_PORT" | |
echo "π§ Worker health checks:" | |
for ((i=0; i<NUM_GPUS; i++)); do | |
echo " GPU $i: http://localhost:$((8001 + i))/health" | |
done | |
echo "" | |
echo "π Log files:" | |
echo " π Analytics (human-readable): system_analytics_*.log" | |
echo " π₯οΈ GPU metrics (JSON): gpu_metrics_*.jsonl" | |
echo " π Connection events (JSON): connection_events_*.jsonl" | |
echo " π Queue metrics (JSON): queue_metrics_*.jsonl" | |
echo " π IP statistics (JSON): ip_stats_*.jsonl" | |
echo " π― Dispatcher: dispatcher.log" | |
echo " π§ Workers summary: workers.log" | |
for ((i=0; i<NUM_GPUS; i++)); do | |
echo " π₯οΈ GPU $i worker: worker_gpu_$i.log" | |
done | |
echo "" | |
echo "π‘ Real-time monitoring:" | |
echo " Human-readable: tail -f system_analytics_*.log" | |
echo " GPU utilization: tail -f gpu_metrics_*.jsonl" | |
echo " Connection events: tail -f connection_events_*.jsonl" | |
echo "" | |
echo "π Data analysis:" | |
echo " Summary report: python analyze_analytics.py" | |
echo " Last 6 hours: python analyze_analytics.py --since 6" | |
echo " GPU analysis only: python analyze_analytics.py --type gpu" | |
echo "Press Ctrl+C to stop the system" | |
echo "================================" | |
# Keep the script running and wait for interrupt | |
while true; do | |
# Check if processes are still running | |
if ! kill -0 $DISPATCHER_PID 2>/dev/null; then | |
echo "β οΈ Dispatcher process died unexpectedly" | |
cleanup | |
exit 1 | |
fi | |
# Check if workers are still running | |
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0") | |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then | |
echo "β οΈ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS" | |
echo "π System will continue operating with reduced capacity" | |
echo "π‘ Check worker logs for error details" | |
# Don't exit - keep system running with remaining workers | |
fi | |
sleep 5 | |
done |