Spaces:
Runtime error
Runtime error
Yixin Liu
committed on
Commit
•
3720a59
1
Parent(s):
8963c43
upload
Browse files
config.py
CHANGED
@@ -82,6 +82,7 @@ function update_device_idx {
|
|
82 |
fi;
|
83 |
|
84 |
# so all the conditions are satisfied, we can update the device idx and run the next experiment
|
|
|
85 |
while true; do
|
86 |
current_device_idx=$((current_device_idx+1))
|
87 |
if [ $current_device_idx -ge ${#available_devices[@]} ]; then
|
@@ -93,9 +94,18 @@ function update_device_idx {
|
|
93 |
useage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i ${available_devices[$current_device_idx]})
|
94 |
utilization=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i ${available_devices[$current_device_idx]})
|
95 |
|
|
|
96 |
if [ $useage -ge $((total_gpu_memory-max_gpu_memory_gap)) ] || [ $utilization -ge $max_gpu_utilization ]; then
|
97 |
echo "device ${available_devices[$current_device_idx]} is fully booked, try next one"
|
98 |
sleep 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
continue
|
100 |
else
|
101 |
break
|
|
|
82 |
fi;
|
83 |
|
84 |
# so all the conditions are satisfied, we can update the device idx and run the next experiment
|
85 |
+
cnt_longer_sleep=0
|
86 |
while true; do
|
87 |
current_device_idx=$((current_device_idx+1))
|
88 |
if [ $current_device_idx -ge ${#available_devices[@]} ]; then
|
|
|
94 |
useage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i ${available_devices[$current_device_idx]})
|
95 |
utilization=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i ${available_devices[$current_device_idx]})
|
96 |
|
97 |
+
|
98 |
if [ $useage -ge $((total_gpu_memory-max_gpu_memory_gap)) ] || [ $utilization -ge $max_gpu_utilization ]; then
|
99 |
echo "device ${available_devices[$current_device_idx]} is fully booked, try next one"
|
100 |
sleep 3
|
101 |
+
|
102 |
+
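# count busy devices modulo the number of available devices (${#available_devices[@]}); each time the counter wraps to 0, sleep an extra 60 seconds before retrying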
|
103 |
+
cnt_longer_sleep=$((cnt_longer_sleep+1))
|
104 |
+
cnt_longer_sleep=$(echo "$cnt_longer_sleep%${#available_devices[@]}" | bc)
|
105 |
+
if [ $cnt_longer_sleep -eq 0 ]; then
|
106 |
+
sleep 60
|
107 |
+
fi
|
108 |
+
|
109 |
continue
|
110 |
else
|
111 |
break
|