File size: 5,637 Bytes
bb690a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3720a59
bb690a1
 
 
 
 
 
 
 
 
 
 
3720a59
8963c43
bb690a1
 
3720a59
 
 
 
 
564624d
3720a59
 
 
bb690a1
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

# Shell statement that invokes the update_device_idx function whose
# definition is stored in update_device_func.
update_device_command = "update_device_idx;\n"

update_device_func = """
#######################################
# Wait until the machine has headroom, then select the next free GPU.
#
# Stages (each one blocks, polling every 10 seconds, until it passes):
#   1. if constrain_total is "true": mean CPU usage over three mpstat
#      samples must be below cpu_mean_max, then the used-memory
#      percentage over three free samples must be below memory_rate_max;
#   2. if constrain_mine is "true": the CPU and memory share of user
#      username_mine must both be below threshold / constrain_rate;
#   3. walk available_devices round-robin (starting after
#      current_device_idx) until a GPU has less than
#      total_gpu_memory - max_gpu_memory_gap memory in use and less than
#      max_gpu_utilization utilization.
#
# Globals read:    constrain_total, constrain_mine, cpu_mean_max,
#                  memory_rate_max, username_mine, constrain_rate,
#                  available_devices, total_gpu_memory,
#                  max_gpu_memory_gap, max_gpu_utilization
# Globals written: current_device_idx (index into available_devices),
#                  device (the selected entry of available_devices)
# External tools:  mpstat, free, ps, nproc, bc, awk, nvidia-smi
# Outputs:         progress messages on stdout, diagnostics on stderr
# Returns:         0 once a device is selected, 1 if available_devices
#                  is empty
#######################################
update_device_idx() {
    # Keep every helper variable local so that only current_device_idx and
    # device are visible to the script that calls this function.
    local cpu_awk cpu_mean_1 cpu_mean_2 cpu_mean_3 cpu_mean
    local mem_total mem_used_1 mem_used_2 mem_used_3 mem_rate
    local cpu_capacity cpu_usage_user_sum cpu_usage_user_ratio
    local memory_usage_user_sum memory_usage_user_ratio
    local cpu_mean_max_mine memory_rate_max_mine
    local candidate gpu_mem_used gpu_util busy_streak
    local device_count=${#available_devices[@]}

    # Without devices the round-robin below could never terminate sensibly.
    if (( device_count == 0 )); then
        echo "available_devices is empty, cannot select a device" >&2
        return 1
    fi

    if [[ $constrain_total == true ]]; then
        # Average the busy percentage (100 - idle, the last column) over the
        # per-CPU "Average:" rows; the guard avoids a division by zero when
        # no row matches.
        cpu_awk='/Average:/ && $2 ~ /[0-9]/ { busy += 100 - $NF; cores++ } END { if (cores) print busy / cores }'

        # check total cpu usage
        while true; do
            # LC_ALL=C pins the "Average:" label and the decimal point.
            cpu_mean_1=$(LC_ALL=C mpstat -P ALL 1 1 | awk "$cpu_awk")
            sleep 1
            cpu_mean_2=$(LC_ALL=C mpstat -P ALL 1 1 | awk "$cpu_awk")
            sleep 1
            cpu_mean_3=$(LC_ALL=C mpstat -P ALL 1 1 | awk "$cpu_awk")
            cpu_mean=$(echo "scale=2; ($cpu_mean_1+$cpu_mean_2+$cpu_mean_3)/3" | bc)

            if [[ -z $cpu_mean ]]; then
                echo "could not measure total cpu usage (are mpstat and bc installed?), sleep 10 seconds" >&2
                sleep 10
                continue
            fi

            # if currently cpu usage is less than the threshold, then break
            if [[ $(echo "$cpu_mean < $cpu_mean_max" | bc) == 1 ]]; then
                echo "total cpu mean: $cpu_mean is less than $cpu_mean_max, continue to check total memory usage"
                break
            fi
            echo "total cpu mean: $cpu_mean is greater than $cpu_mean_max, sleep 10 seconds"
            sleep 10
        done

        # check total memory usage
        mem_total=$(LC_ALL=C free -m | awk '/Mem:/ {print $2}')
        while true; do
            # get memory usage of whole system (MiB), three samples
            mem_used_1=$(LC_ALL=C free -m | awk '/Mem:/ {print $3}')
            sleep 1
            mem_used_2=$(LC_ALL=C free -m | awk '/Mem:/ {print $3}')
            sleep 1
            mem_used_3=$(LC_ALL=C free -m | awk '/Mem:/ {print $3}')

            # Multiply before dividing: with scale=2 a "used/total*100"
            # expression would truncate the result to a whole percent.
            mem_rate=$(echo "scale=2; ($mem_used_1+$mem_used_2+$mem_used_3)*100/(3*$mem_total)" | bc)

            if [[ -z $mem_rate ]]; then
                echo "could not measure total memory usage (are free and bc installed?), sleep 10 seconds" >&2
                sleep 10
                continue
            fi

            if [[ $(echo "$mem_rate < $memory_rate_max" | bc) == 1 ]]; then
                echo "total memory rate: $mem_rate is less than $memory_rate_max, continue to check my own cpu and memory usage"
                break
            fi
            echo "total memory rate: $mem_rate is greater than $memory_rate_max, sleep 10 seconds"
            sleep 10
        done
    fi

    # if constrain_mine
    if [[ $constrain_mine == true ]]; then
        # Loop-invariant values: CPU capacity in percent (100 per core),
        # total memory in MiB and the per-user limits. scale=2 keeps the
        # fractional part of threshold / constrain_rate instead of
        # truncating it to an integer.
        cpu_capacity=$(( $(nproc) * 100 ))
        mem_total=$(LC_ALL=C free -m | awk '/Mem:/ {print $2}')
        cpu_mean_max_mine=$(echo "scale=2; $cpu_mean_max/$constrain_rate" | bc)
        memory_rate_max_mine=$(echo "scale=2; $memory_rate_max/$constrain_rate" | bc)

        # check my own cpu and memory usage, it should be less than
        # 1/$constrain_rate of the given cpu_mean_max / memory_rate_max
        while true; do
            # Sum of %CPU over all processes of the user; printf keeps awk
            # from switching to exponent notation, which bc cannot parse.
            cpu_usage_user_sum=$(ps -u "$username_mine" -o %cpu | awk '{sum+=$1} END {printf "%.2f", sum}')
            cpu_usage_user_ratio=$(echo "scale=2; $cpu_usage_user_sum*100/$cpu_capacity" | bc)

            # Sum of resident set sizes (KiB) converted to MiB.
            memory_usage_user_sum=$(ps -u "$username_mine" -o rss | awk '{sum+=$1} END {printf "%.2f", sum/1024}')
            memory_usage_user_ratio=$(echo "scale=2; $memory_usage_user_sum*100/$mem_total" | bc)

            if [[ $(echo "$cpu_usage_user_ratio < $cpu_mean_max_mine" | bc) == 1 && $(echo "$memory_usage_user_ratio < $memory_rate_max_mine" | bc) == 1 ]]; then
                echo "my cpu usage: $cpu_usage_user_ratio, memory usage: $memory_usage_user_ratio is less than 1/$constrain_rate of the given threshold for cpu: $cpu_mean_max_mine and memory: $memory_rate_max_mine, ready to take off"
                break
            fi
            echo "my cpu usage: $cpu_usage_user_ratio, memory usage: $memory_usage_user_ratio is not below 1/$constrain_rate of the given threshold for cpu: $cpu_mean_max_mine and memory: $memory_rate_max_mine, sleep 10 seconds"
            sleep 10
        done
    fi

    # so all the conditions are satisfied, we can update the device idx and
    # run the next experiment
    busy_streak=0
    while true; do
        # Advance round-robin, wrapping at the end of the list.
        current_device_idx=$((current_device_idx+1))
        if (( current_device_idx >= device_count )); then
            current_device_idx=0
        fi
        candidate=${available_devices[$current_device_idx]}

        # Query used memory and utilization of the candidate GPU.
        gpu_mem_used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i "$candidate")
        gpu_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i "$candidate")
        gpu_mem_used=${gpu_mem_used//[[:space:]]/}
        gpu_util=${gpu_util//[[:space:]]/}

        if [[ $gpu_mem_used =~ ^[0-9]+$ && $gpu_util =~ ^[0-9]+$ ]]; then
            # 10# forces base 10 so a leading zero is never read as octal.
            if (( 10#$gpu_mem_used < total_gpu_memory - max_gpu_memory_gap && 10#$gpu_util < max_gpu_utilization )); then
                break
            fi
            echo "device $candidate is fully booked, try next one"
        else
            # A failed or non-numeric query must not count as a free device.
            echo "device $candidate status could not be read from nvidia-smi, try next one" >&2
        fi
        sleep 3

        # After every full pass over the device list without success,
        # back off for longer.
        busy_streak=$(( (busy_streak + 1) % device_count ))
        if (( busy_streak == 0 )); then
            echo "sleep 60 seconds since all devices are fully booked"
            sleep 60
        fi
    done
    echo "current device: $candidate"
    device=$candidate
}
"""