File size: 3,654 Bytes
99a05f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import sys
import stat
import shutil
import subprocess

from loguru import logger

# Known GPU models, keyed by a short alias. Each value is a tuple of
#   (device name, architecture family, memory size).
# The device name deliberately carries its own surrounding double quotes: it is
# interpolated verbatim into the `CUDADeviceName==...` requirements expression
# of the condor submit file.
# The memory size is compared against `min_mem` in get_gpus (presumably MB --
# TODO confirm the unit).
# NOTE(review): the 'v100-p16' alias points at a P100 device name; confirm
# whether the key is a typo before renaming it, other modules may look it up.
GPUS = {
    'v100-v16': ('"Tesla V100-PCIE-16GB"', 'tesla', 16000),
    'v100-p32': ('"Tesla V100-PCIE-32GB"', 'tesla', 32000),
    'v100-s32': ('"Tesla V100-SXM2-32GB"', 'tesla', 32000),
    'v100-p16': ('"Tesla P100-PCIE-16GB"', 'tesla', 16000),
}

def get_gpus(min_mem=10000, arch=('tesla', 'quadro', 'rtx')):
    """Return the device names of every entry in GPUS that fits the constraints.

    Args:
        min_mem: Minimum memory an entry must offer; compared against the
            third field of each GPUS value (same unit as that field).
        arch: Container of accepted architecture names; an entry qualifies
            when its second field is ``in`` this container.

    Returns:
        A list of device-name strings (first field of the matching GPUS
        values), in GPUS iteration order.

    Raises:
        AssertionError: If no entry in GPUS satisfies both constraints.
    """
    gpu_names = [
        gpu_name
        for gpu_name, gpu_arch, gpu_mem in GPUS.values()
        if gpu_mem >= min_mem and gpu_arch in arch
    ]

    # Raised explicitly instead of via an `assert` statement so the check is
    # not stripped when Python runs with -O. AssertionError is kept so callers
    # that already catch it keep working.
    if not gpu_names:
        raise AssertionError('Suitable GPU model could not be found')

    return gpu_names


def execute_task_on_cluster(
        script,
        exp_name,
        output_dir,
        condor_dir,
        cfg_file,
        num_exp=1,
        exp_opts=None,
        bid_amount=10,
        num_workers=2,
        memory=64000,
        gpu_min_mem=10000,
        gpu_arch=('tesla', 'quadro', 'rtx'),
        num_gpus=1
):
    """Write a condor submit file plus wrapper script and hand them to ``condor_submit_bid``.

    The function snapshots the config file, generates
    ``<condor_dir>/condorlog/<exp_name>/<exp_name>_submit.sub`` and
    ``<exp_name>_run.sh`` next to it, makes the script executable for the
    owner, and runs ``condor_submit_bid <bid_amount> <submit file>``.

    Args:
        script: Script path placed after the Python interpreter in the
            wrapper script's command line.
        exp_name: Experiment name; used in directory and file names.
        output_dir: Only reported in a log message here.
        condor_dir: Directory under which ``condorlog/<exp_name>`` is created.
            The config snapshot goes to ``temp_configs/<exp_name>`` under
            ``os.path.dirname(condor_dir)``.
        cfg_file: Config file copied to the snapshot location; the copy is
            what the job receives via ``--cfg``.
        num_exp: Count written to the submit file's ``queue`` statement.
        exp_opts: Optional iterable of extra options appended after
            ``--opts`` (each formatted with ``str.format`` semantics and
            separated by single spaces).
        bid_amount: First argument passed to ``condor_submit_bid``.
        num_workers: Written (as ``int``) to ``request_cpus``.
        memory: Written to ``request_memory``.
        gpu_min_mem: Forwarded to ``get_gpus`` as ``min_mem``.
        gpu_arch: Forwarded to ``get_gpus`` as ``arch``.
        num_gpus: Written to ``request_gpus``.

    Returns:
        None. A non-zero exit status from ``condor_submit_bid`` is logged as
        an error but does not raise.
    """
    # Copy the config to a per-experiment directory and source it from there.
    # This makes sure the job sees the config as it was at submission time even
    # if the original file is edited before the first job actually starts.
    temp_config_dir = os.path.join(os.path.dirname(condor_dir), 'temp_configs', exp_name)
    os.makedirs(temp_config_dir, exist_ok=True)
    new_cfg_file = os.path.join(temp_config_dir, 'config.yaml')
    shutil.copy(src=cfg_file, dst=new_cfg_file)

    # Build the condor requirements expression: any of the suitable GPU models.
    gpu_names = get_gpus(min_mem=gpu_min_mem, arch=gpu_arch)
    gpu_requirements = ' || '.join([f'CUDADeviceName=={x}' for x in gpu_names])

    condor_log_dir = os.path.join(condor_dir, 'condorlog', exp_name)
    os.makedirs(condor_log_dir, exist_ok=True)
    executable_path = f'{condor_log_dir}/{exp_name}_run.sh'
    submit_path = f'{condor_log_dir}/{exp_name}_submit.sub'
    job_log_prefix = f'{condor_log_dir}/{exp_name}_$(Cluster).$(Process)'

    # The wrapper script receives $(Process) as $1 and $(Cluster) as $2.
    # NOTE(review): +MaxRunningPrice is hard-coded to 500.
    submission = '\n'.join([
        f'executable = {executable_path}',
        'arguments = $(Process) $(Cluster)',
        f'error = {job_log_prefix}.err',
        f'output = {job_log_prefix}.out',
        f'log = {job_log_prefix}.log',
        f'request_memory = {memory}',
        f'request_cpus={int(num_workers)}',
        f'request_gpus={num_gpus}',
        f'requirements={gpu_requirements}',
        '+MaxRunningPrice = 500',
        f'queue {num_exp}',
    ])
    print('<<< Condor Submission >>> ')
    print(submission)

    with open(submit_path, 'w') as f:
        f.write(submission)

    logger.info(f'The logs for this experiments can be found under: {condor_log_dir}')
    logger.info(f'The outputs for this experiments can be found under: {output_dir}')

    # This is the trick: the wrapper invokes the script WITHOUT --cluster, so
    # on the node the script takes its regular (non-submitting) code path.
    # PYTHONUNBUFFERED is the variable Python actually honours; the previous
    # spelling (PYTHONBUFFERED) was silently ignored.
    # NOTE(review): `$PATH` is written literally, so the export re-exports the
    # job's own PATH -- confirm whether the submit-time PATH was intended.
    bash = 'export PYTHONUNBUFFERED=1\n export PATH=$PATH\n ' \
           f'{sys.executable} {script} --cfg {new_cfg_file} --cfg_id $1'

    # User options first, then the node id as <cluster>.<process>.
    opts = [] if exp_opts is None else [f'{opt}' for opt in exp_opts]
    opts.append('SYSTEM.CLUSTER_NODE $2.$1')
    bash += ' --opts ' + ' '.join(opts)

    with open(executable_path, 'w') as f:
        f.write(bash)

    # Owner-only read/write/execute so condor can run the wrapper.
    os.chmod(executable_path, stat.S_IRWXU)

    cmd = ['condor_submit_bid', f'{bid_amount}', submit_path]
    logger.info('Executing ' + ' '.join(cmd))
    returncode = subprocess.call(cmd)
    if returncode != 0:
        # Surface the failure instead of dropping the exit status; do not raise
        # so existing callers keep their best-effort behaviour.
        logger.error(
            f'condor_submit_bid exited with status {returncode}; '
            'the job may not have been submitted'
        )