Spaces:

orhir
/

EdgeCape

Running on Zero

File size: 17,262 Bytes

184241a

# Description: Script to run multiple experiments on runai
import re
import subprocess
import os
import argparse
import time
from prettytable import PrettyTable

class Bcolors:
    """ANSI escape sequences for colouring and styling terminal output.

    Typical use is ``f'{Bcolors.WARNING}text{Bcolors.ENDC}'``: a style code,
    the text, then ``ENDC`` to reset the terminal to its default rendering.
    """

    # Reset all attributes.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Bright foreground colours (SGR codes 91-96).
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'


def pretty_table(dct):
    """Print *dct* as a two-column ``Job`` / ``Status`` table.

    Args:
        dct: Mapping of job name to its status value. Rows are emitted in
            sorted key order.
    """
    rows = [[job, dct[job]] for job in sorted(dct)]
    table = PrettyTable(['Job', 'Status'])
    for row in rows:
        table.add_row(row)
    print(table)


def init_parser():
    """Build the command-line interface and parse ``sys.argv``.

    The positional ``action`` selects which kind of job is addressed
    (``train``, ``test`` or ``run``); the optional flags select what is done
    with it (submit, show logs, delete, collect results, show status) and
    tune the generated command.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(prog="RUNAI SCRIPT")
    parser.add_argument('action', type=str, default=None, help='Train or Test', choices=['train', 'test', 'run'])

    # Which configs to operate on.
    parser.add_argument('--config_folder', type=str, default=None, help='Run all configs in folder')
    parser.add_argument('--config', type=str, default=None, help='Run a single config file')
    parser.add_argument('--name', type=str, default=None, help='prefix')

    # What to do with the matching jobs.
    parser.add_argument('--delete', action='store_true', help='Delete job')
    parser.add_argument('--delete_fail', action='store_true', help='Delete job only if it failed')
    parser.add_argument('--delete_pending', action='store_true', help='Delete job only if it is pending')
    parser.add_argument('--log', action='store_true', help='Show logs')
    parser.add_argument('--delete_folder', action='store_true', help='Delete workdir folder')
    parser.add_argument('--permute_keypoints', action='store_true', help='Forward the permute keypoints flag to test.py')
    parser.add_argument('--dist', action='store_true', help='Distributed  Training')
    parser.add_argument('--find_best', action='store_true', help='Find best according to val')
    parser.add_argument('--results', action='store_true', help='Show Results')
    parser.add_argument('--no_base', action='store_true', help='Skip base models')
    parser.add_argument('--show_cmd', action='store_true', help='Show command')
    parser.add_argument('--large', action='store_true', help='Use large node')
    parser.add_argument('--eval_three', action='store_true', help='Evaluate on 3 ckpts')

    # Which metric to read when collecting results.
    parser.add_argument('--pck', type=float, default=0.2, help='PCK threshold')
    parser.add_argument('--auc',  action='store_true', help='Evaluate AUC')
    parser.add_argument('--mpck',  action='store_true', help='Evaluate mPCK')
    parser.add_argument('--check_logs', action='store_true', help='check runai logs instead of workdir')
    parser.add_argument('--stat', action='store_true', help='check runai status')
    parser.add_argument('--CVPR24', action='store_true', help='run on CVPR24 legacy folder')

    # Options forwarded to run.py / test.py.
    parser.add_argument('--run_best_ckpt', action='store_true', help='Forward the best checkpoint flag to run.py')
    parser.add_argument('--num_samples', type=int, default=32, help='Number of samples forwarded to test.py')
    parser.add_argument('--ft_epochs', type=int, default=None, help='Num of FT epochs')
    parser.add_argument('--masking', type=float, default=None, help='Masking ratio forwarded to run.py')
    parser.add_argument('--masking_lamda', type=float, default=None, help='Masking lamda forwarded to run.py')

    return parser.parse_args()


def check_status(job_name):
    """Return the status reported by ``runai describe job <job_name>``.

    The command's standard output is searched for the first
    ``Status: <word>`` entry.

    Returns:
        str | None: The status word, or ``None`` when the output contains no
        such entry.
    """
    completed = subprocess.run(f'runai describe job {job_name}', shell=True, capture_output=True)
    described = completed.stdout.decode('utf-8')
    found = re.search(r"Status:\s+(\w+)", described)
    return found.group(1) if found else None


def train_is_running(job_name, status=('Running', 'Pending', 'Failed')):
    """Tell whether a runai job is currently in one of the given states.

    Despite the name, the states checked are whatever *status* lists; callers
    also use it to test for failed, pending or completed jobs.

    Args:
        job_name: Name of the runai job to look up.
        status: Iterable of status strings to match against. The default is
            an immutable tuple so it cannot be altered between calls.

    Returns:
        bool: ``True`` (after printing the matched state) when the job's
        status equals one of *status*, otherwise ``False``.
    """
    run_status = check_status(job_name)
    if any(run_status == candidate for candidate in status):
        print(f'{Bcolors.FAIL}{job_name} is {run_status}{Bcolors.ENDC}')
        return True
    return False


def get_best_run(workdir_path, config, find_best):
    """Choose the checkpoint to evaluate for a given run label.

    The local mirror ``work_dir_runai/<config stem>`` is inspected to decide
    which file to use; the returned path itself is built under *workdir_path*.

    Args:
        workdir_path: Work directory used to build the checkpoint path.
        config: Config name; everything before the first ``.`` names the
            local work directory.
        find_best: Run label. ``'best'`` looks for a ``best_*`` file,
            ``'epoch_100'`` selects ``epoch_100.pth``, and any other value
            selects ``latest.pth``.

    Returns:
        tuple[str, str | None]: ``(ckpt_path, file_name)``. ``file_name`` is
        ``None`` when the function falls back to ``latest.pth``.
    """
    file_name = None
    local_path = f'work_dir_runai/{config.split(".")[0]}'
    if find_best in ('best', 'epoch_100') and os.path.isdir(local_path):
        if find_best == 'best':
            # Sort so the pick does not depend on os.listdir's arbitrary
            # ordering when several best_* files are present.
            candidates = sorted(name for name in os.listdir(local_path) if name.startswith('best_'))
            if candidates:
                file_name = candidates[0]
        else:
            file_name = 'epoch_100.pth'
    if file_name is None:
        return f'{workdir_path}/latest.pth', None
    return f'{workdir_path}/{file_name}', file_name

def check_runai_logs(job_name):
    """Return the standard output of ``runai logs <job_name>`` as text."""
    completed = subprocess.run(f'runai logs {job_name}', shell=True, capture_output=True, text=True)
    return completed.stdout


def get_run_name(config, args, run):
    """Derive the train and test job names for a config.

    Names are built from the lower-cased config stem with underscores turned
    into dashes: ``or-<stem>`` for training and ``ev-<run>-<stem>`` for
    evaluation. Over-long names are rebuilt from the abbreviated config and
    truncated, one trailing dash is dropped, and ``args.name`` (when set) is
    prepended as a prefix.

    Args:
        config: Config name; the part before the first ``.`` is used.
        args: Parsed arguments; only ``args.name`` is read.
        run: Run label, e.g. ``'latest'`` or ``'epoch_100'``.

    Returns:
        list[str]: ``[train_job_name, test_job_name]``.
    """
    def _stem(text):
        # Part before the first dot, with dashes instead of underscores.
        return text.split(".")[0].replace("_", "-")

    run = run.replace('_', '-')
    config_lower = config.lower()

    train_name = f'or-{_stem(config_lower)}'
    if len(train_name) > 60:
        train_name = f'or-{_stem(name_abriviator(config_lower))}'[:60]

    test_name = f'ev-{run}-{_stem(config_lower)}'
    if len(test_name) > 40:
        test_name = f'ev-{run}-{_stem(name_abriviator(config_lower))}'[:58]

    finished = []
    for candidate in (train_name, test_name):
        if candidate.endswith('-'):
            candidate = candidate[:-1]
        if args.name is not None:
            candidate = f'{args.name}-{candidate}'
        finished.append(candidate)
    return finished


def name_abriviator(name):
    """Shorten *name* by replacing known long terms with abbreviations.

    The replacements are applied one after another, in the order listed, to
    every occurrence in the string.
    """
    abbreviations = (
        ('encoder', 'enc'),
        ('decoder', 'dec'),
        ('look_twice', 'lt'),
        ('cross_category', 'cc'),
        ('max_hops', 'hops'),
        ('lamda', 'l'),
        ('symmetric', 'sym'),
        ('auxiliary', 'aux'),
        ('batch_size', 'bs'),
    )
    shortened = name
    for long_form, short_form in abbreviations:
        shortened = shortened.replace(long_form, short_form)
    return shortened


def check_skip(lwr_config, args):
    """Decide whether a config should be skipped.

    A config is skipped (and a notice printed) only when ``args.no_base`` is
    set and the lower-cased config name contains ``'base'``.

    Returns:
        bool: ``True`` to skip the config, ``False`` to process it.
    """
    # NOTE: an earlier rule that skipped cross-category configs for non-train
    # actions is disabled.
    is_skipped_base = args.no_base and 'base' in lwr_config
    if not is_skipped_base:
        return False
    print(f'Skipping {Bcolors.OKCYAN}{lwr_config}{Bcolors.ENDC} - base model')
    return True


def print_results(results):
    """Print a score table with one row per config.

    Args:
        results: Mapping ``config -> {key: score}``. The ``'latest'`` key, if
            present, holds the score of the latest checkpoint; every other
            key names another evaluated checkpoint. Scores are fractions, or
            ``None`` when no result was found.

    For each config the row shows the higher of the latest and best scores,
    the latest score, the best non-latest score, and the key that best score
    belongs to. Missing entries are shown as "No results" / "No Best" rather
    than raising.
    """
    print(f'\n\n\n{Bcolors.OKGREEN}Scores{Bcolors.ENDC}')
    # default=0 keeps an empty results dict from raising in max().
    config_length = max(15, max((len(key) for key in results), default=0))
    config_column_width = config_length + 2
    # Header widths match the visible width of the rows below: each coloured
    # cell carries 9 invisible escape characters, so rows pad to 20/22/20.
    print(f'| {"Config":<{config_column_width}} | {"Max Value":<11} | {"Latest Value":<13} | {"Best Value":<11} |{"Best Epoch":<10} |')
    print(f'|{"-" * (config_column_width + 2)}|{"-" * 13}|{"-" * 15}|{"-" * 13}|{"-" * 11}|')
    for config, val_dict in sorted(results.items()):
        config_print = config.split('/')[-1].replace('.py', '')
        latest_score = val_dict.get('latest')
        other_results = {key: value for key, value in val_dict.items() if key != 'latest'}
        if other_results:
            # Rank missing scores (None) below every real score so that
            # mixed None/float entries can be compared.
            best_key = max(
                other_results,
                key=lambda key: (other_results[key] is not None, other_results[key] or 0),
            )
            best_score = other_results[best_key]
        else:
            best_key, best_score = 'No Best', None
        latest_val = parse_result(latest_score, Bcolors.OKBLUE)
        best_val = parse_result(best_score, Bcolors.HEADER)
        if latest_score is None and best_score is None:
            max_val = f'{Bcolors.WARNING}No results{Bcolors.ENDC}'
        elif latest_score is None:
            max_val = best_val
        elif best_score is None:
            max_val = latest_val
        else:
            max_val = latest_val if latest_score > best_score else best_val
        # print as a table: config, max_val, latest_val, best_val
        print(f'| {config_print:<{config_column_width}} | {max_val:<20} | {latest_val:<22} | {best_val:<20} |{best_key:<10} |')


def parse_result(value, color):
    """Format a score for display.

    Args:
        value: Score as a fraction, or ``None`` when there is no result.
        color: Escape sequence placed before the formatted number.

    Returns:
        str: The score multiplied by 100 and rounded to two decimals, wrapped
        in *color*; or a warning-coloured "No results" when *value* is
        ``None``.
    """
    if value is not None:
        return f'{color}{round(value * 100, 2)}{Bcolors.ENDC}'
    return f'{Bcolors.WARNING}No results{Bcolors.ENDC}'


def main():
    """Run the requested runai operation for every selected config.

    Configs come from ``--config`` (a single file) or ``--config_folder``
    (a folder walked recursively, or a ``*`` prefix pattern). For each config
    and each run label (``latest`` / ``best`` / ``epoch_100``, depending on
    the flags) the job name and python command are derived, then exactly one
    operation is performed, checked in this order:

    ``--stat`` (collect status), ``--log``, ``--delete_fail``,
    ``--delete_pending``, ``--delete``, ``--results`` (collect scores),
    otherwise submit the job.

    Collected scores and statuses are printed as tables at the end.

    Raises:
        SystemExit: If neither ``--config`` nor ``--config_folder`` is given.
    """
    delay = 1  # seconds slept after each executed command
    args = init_parser()
    scores = {}
    stat = {}
    best_run = None
    if args.config_folder:
        configs = []
        if '*' in args.config_folder:
            # Pattern mode: keep every entry of the pattern's parent directory
            # whose path contains the text preceding the first '*'.
            config_folder = args.config_folder.strip("'")
            parent_folder = os.path.relpath(os.path.join(config_folder, os.pardir))
            prefix = config_folder.split('*')[0]
            configs = [os.path.join(parent_folder, f) for f in os.listdir(parent_folder)
                       if prefix in os.path.join(parent_folder, f)]
        else:
            # list all py files in folder and subfolders
            for root, _dirs, files in os.walk(args.config_folder):
                for file in files:
                    if file.endswith(".py"):
                        configs.append(os.path.join(root, file))
    elif args.config:
        configs = [args.config]
    else:
        # Without this guard the loop below would fail on a None config path.
        raise SystemExit('Either --config or --config_folder must be provided')
    print(f"{Bcolors.OKGREEN}Running {args.action} on {len(configs)} configs{Bcolors.ENDC}")
    if args.action == "test" and not args.eval_three and not args.find_best:
        runs = ['latest', 'best']
    elif args.eval_three:
        runs = ['latest', 'best', 'epoch_100']
    elif args.find_best:
        runs = ['best']
    else:
        runs = ['latest']
    for config_path in sorted(configs):
        for run in runs:
            # Config name is "<parent folder>_<file name>" with "_config"
            # removed from the file name; a bare file name (no folder part)
            # is used as-is instead of raising IndexError.
            path_parts = config_path.split("/")
            file_part = path_parts[-1].replace('_config', '')
            if len(path_parts) > 1:
                config = path_parts[-2] + "_" + file_part
            else:
                config = file_part
            if args.CVPR24:
                workdir_path = f'/storage/orhir/capeformer_legacy/{config.split(".")[0]}'
            else:
                workdir_path = f'/storage/orhir/capeformer/{config.split(".")[0]}'
            local_workdir_path = f'work_dir_runai/{config.split(".")[0]}'
            lwr_config = config.lower()
            if check_skip(lwr_config, args):
                continue
            if args.action == "train" or args.action == "run":
                gpu = 4 if args.dist else 1
                resource = f' -g {gpu}'
            else:
                resource = ' -g 0.3'
            if args.large:
                resource += ' --node-pools blaufer'
            if args.stat:
                # Status mode only records the job state; nothing is executed.
                train_job_name, job_name = get_run_name(config, args, run)
                if args.action == "train" or args.action == "run":
                    job_name = train_job_name
                print(f'{"-" * 30 + Bcolors.OKCYAN + job_name + Bcolors.ENDC + "-" * 30}')
                status = check_status(job_name)
                stat[job_name] = status
                continue
            if args.action == "train":
                job_name, _ = get_run_name(config, args, run)
                if args.dist:
                    py_command = (f'python -m torch.distributed.launch '
                                  f'--nproc_per_node={gpu} --master_port=29500 '
                                  f'train.py --gpus {gpu} --config {config_path} '
                                  f'--work-dir {workdir_path}  --autoscale-lr '
                                  f'--launcher pytorch')
                else:
                    py_command = (f'python train.py  '
                                  f' --config {config_path}'
                                  f' --work-dir {workdir_path}')
            elif args.action == "run":
                job_name, _ = get_run_name(config, args, run)
                if args.masking is not None:
                    masking_precent = int(args.masking * 100)
                    workdir_path = f'/storage/orhir/capeformer/CVPR25_ablation_mask_{masking_precent}'
                    job_name += f'-{masking_precent}'
                if args.masking_lamda:
                    workdir_path = f'/storage/orhir/capeformer/CVPR25_ablation_mask_lamda_{int(args.masking_lamda)}'
                    job_name += f'-lamda-{int(args.masking_lamda)}'
                py_command = (f'python run.py  '
                              f' --config {config_path}'
                              f' --work_dir {workdir_path}')
                if args.run_best_ckpt:
                    py_command += ' --best'
                    job_name += '-best'
                if args.ft_epochs:
                    py_command += f' --ft_epochs {args.ft_epochs}'
                # Same "is not None" test as the work-dir naming above, so an
                # explicit ratio of 0 is forwarded as well.
                if args.masking is not None:
                    py_command += f' --masking_ratio {args.masking}'
                if args.masking_lamda:
                    py_command += f' --lamda_masking {args.masking_lamda}'
            else:
                train_job_name, job_name = get_run_name(config, args, run)
                ckpt_path, best_run = get_best_run(workdir_path, config, run)
                py_command = f'python test.py {config_path} {ckpt_path} --num_samples {args.num_samples}'
                if args.permute_keypoints:
                    py_command += ' --permute_keypoints'
                    job_name = (job_name + '-permute-keypoints')[:60]
            print(f'{"-" * 30 + Bcolors.OKCYAN + job_name + Bcolors.ENDC + "-" * 30}')
            if args.log:
                os_command = f'runai logs {job_name}'
            elif args.delete_fail:
                if not train_is_running(job_name, ['Failed', 'Error']):
                    print("Job not failed, skipping...")
                    continue
                os_command = f'runai delete job {job_name}'
            elif args.delete_pending:
                if not train_is_running(job_name, ['Pending']):
                    continue
                os_command = f'runai delete job {job_name}'
            elif args.delete:
                os_command = f'runai delete job {job_name}'
            elif args.results:
                if args.check_logs:
                    status = check_runai_logs(job_name)
                else:
                    if args.action == 'run':
                        log_file = os.path.join(local_workdir_path,
                                                'base_skeleton_bias',
                                                'testing_log.txt')
                    else:
                        log_file = os.path.join(local_workdir_path,
                                                'testing_log.txt')
                    if os.path.exists(log_file):
                        with open(log_file, 'r') as f:
                            status = f.read()
                        # Keep only the metrics section that follows the
                        # checkpoint entry for this run. Raw string so the
                        # regex escapes are not treated as string escapes.
                        match = re.search(
                            rf'\*\*[\s\S]*?checkpoint:\s*.*?{re.escape(run)}[\s\S]*?(AUC:[\s\S]*?mPCK:\s*[\d.]+)',
                            status)
                        if match:
                            status = match.group(1)
                        else:
                            status = ''
                        # Local files are read without calling runai, so no
                        # pause is needed from here on.
                        delay = 0
                    else:
                        status = check_runai_logs(job_name)
                if args.auc and 'AUC' in status:
                    score = float(status.split('AUC: ')[1].split('\n')[0])
                elif args.mpck and 'mPCK' in status:
                    score = float(status.split('mPCK: ')[1].split('\n')[0])
                elif f'PCK@{args.pck}:' in status:
                    score = float(status.split(f'PCK@{args.pck}: ')[1].split('\n')[0])
                else:
                    score = None
                # Build the label for the non-latest checkpoint. The '.pth'
                # suffix is removed explicitly: str.strip('.pth') would strip
                # any of those characters from both ends.
                if best_run:
                    best_label = best_run.replace('best_PCK_', '')
                    if best_label.endswith('.pth'):
                        best_label = best_label[:-len('.pth')]
                else:
                    best_label = "No Best"
                key = 'latest' if run == 'latest' else best_label
                scores.setdefault(config, {})[key] = score
                continue
            else:
                if args.action == 'test':
                    # Only evaluate once the matching training job finished.
                    if not train_is_running(train_job_name, ['Completed', 'Succeeded']):
                        print('Train not completed')
                        continue
                os_command = (f'runai submit --pvc=storage:/storage -i orhir/capeformer '
                              f' --name {job_name} {resource} --large-shm '
                              f' --command -- {py_command}')
            if args.show_cmd:
                print(f'{Bcolors.OKGREEN}{os_command}{Bcolors.ENDC}')
            subprocess.run(os_command, shell=True)
            if args.delete_folder:
                if os.path.exists(local_workdir_path):
                    subprocess.run(f'rm -rf {local_workdir_path}', shell=True)
                else:
                    print(f'{Bcolors.WARNING}No workdir folder to delete{Bcolors.ENDC}')
            time.sleep(delay)
    if args.results:
        print_results(scores)
    if args.stat:
        pretty_table(stat)

# Entry point: run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()