Spaces:
Running
Running
File size: 391 Bytes
7af4264 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# Launch train.py under torchrun and automatically restart it after a crash.
#
# Usage: <this script> CONFIG GPUS
#   CONFIG  path to the config file handed to train.py via --c; the name of
#           its parent directory is handed to train.py via --model
#   GPUS    value for torchrun's --nproc_per_node
#
# Both arguments are required: an empty CONFIG would make the cleanup filter
# below match every python process on the machine.
CONFIG=${1:?usage: $0 CONFIG GPUS}
GPUS=${2:?usage: $0 CONFIG GPUS}
MODEL_NAME=$(basename -- "$(dirname -- "$CONFIG")")
PORT=10902

# auto-resume: the code sometimes crashes due to a gloo bug on some GPUs
while :; do
  # A zero exit status means the run finished; only failures are retried.
  if torchrun --nproc_per_node="$GPUS" \
    --master_port="$PORT" \
    train.py --c "$CONFIG" --model "$MODEL_NAME"; then
    break
  fi

  # Force-kill any leftover python processes whose command line mentions
  # this config, so the next attempt starts clean.
  # CONFIG is matched as a fixed string (-F), not as a regex. The two grep
  # stages do not match themselves: the first lacks "python" on its command
  # line, the second lacks CONFIG (assuming CONFIG does not contain "python").
  ps -eo pid=,args= \
    | grep -F -- "$CONFIG" \
    | grep python \
    | awk '{print $1}' \
    | while read -r PID; do
      echo "$PID"
      kill -9 "$PID"
    done

  # Pause before relaunching.
  sleep 30
done