Spaces:
Running
Running
import math | |
import argparse | |
import pprint | |
from distutils.util import strtobool | |
from pathlib import Path | |
from loguru import logger as loguru_logger | |
import pytorch_lightning as pl | |
from pytorch_lightning.utilities import rank_zero_only | |
from pytorch_lightning.loggers import TensorBoardLogger | |
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor | |
from pytorch_lightning.plugins import DDPPlugin | |
from src.config.default import get_cfg_defaults | |
from src.utils.misc import get_rank_zero_only_logger, setup_gpus | |
from src.utils.profiler import build_profiler | |
from src.lightning_trainer.data import MultiSceneDataModule | |
from src.lightning_trainer.trainer import PL_Trainer | |
loguru_logger = get_rank_zero_only_logger(loguru_logger) | |
def parse_args(): | |
# init a costum parser which will be added into pl.Trainer parser | |
# check documentation: https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-flags | |
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument( | |
'data_cfg_path', type=str, help='data config path') | |
parser.add_argument( | |
'main_cfg_path', type=str, help='main config path') | |
parser.add_argument( | |
'--exp_name', type=str, default='default_exp_name') | |
parser.add_argument( | |
'--batch_size', type=int, default=4, help='batch_size per gpu') | |
parser.add_argument( | |
'--num_workers', type=int, default=4) | |
parser.add_argument( | |
'--pin_memory', type=lambda x: bool(strtobool(x)), | |
nargs='?', default=True, help='whether loading data to pinned memory or not') | |
parser.add_argument( | |
'--ckpt_path', type=str, default=None, | |
help='pretrained checkpoint path, helpful for using a pre-trained coarse-only LoFTR') | |
parser.add_argument( | |
'--disable_ckpt', action='store_true', | |
help='disable checkpoint saving (useful for debugging).') | |
parser.add_argument( | |
'--profiler_name', type=str, default=None, | |
help='options: [inference, pytorch], or leave it unset') | |
parser.add_argument( | |
'--parallel_load_data', action='store_true', | |
help='load datasets in with multiple processes.') | |
parser = pl.Trainer.add_argparse_args(parser) | |
return parser.parse_args() | |
def main(): | |
# parse arguments | |
args = parse_args() | |
rank_zero_only(pprint.pprint)(vars(args)) | |
# init default-cfg and merge it with the main- and data-cfg | |
config = get_cfg_defaults() | |
config.merge_from_file(args.main_cfg_path) | |
config.merge_from_file(args.data_cfg_path) | |
pl.seed_everything(config.TRAINER.SEED) # reproducibility | |
# TODO: Use different seeds for each dataloader workers | |
# This is needed for data augmentation | |
# scale lr and warmup-step automatically | |
args.gpus = _n_gpus = setup_gpus(args.gpus) | |
config.TRAINER.WORLD_SIZE = _n_gpus * args.num_nodes | |
config.TRAINER.TRUE_BATCH_SIZE = config.TRAINER.WORLD_SIZE * args.batch_size | |
_scaling = config.TRAINER.TRUE_BATCH_SIZE / config.TRAINER.CANONICAL_BS | |
config.TRAINER.SCALING = _scaling | |
config.TRAINER.TRUE_LR = config.TRAINER.CANONICAL_LR * _scaling | |
config.TRAINER.WARMUP_STEP = math.floor(config.TRAINER.WARMUP_STEP / _scaling) | |
# lightning module | |
profiler = build_profiler(args.profiler_name) | |
model = PL_Trainer(config, pretrained_ckpt=args.ckpt_path, profiler=profiler) | |
loguru_logger.info(f"Model LightningModule initialized!") | |
# lightning data | |
data_module = MultiSceneDataModule(args, config) | |
loguru_logger.info(f"Model DataModule initialized!") | |
# TensorBoard Logger | |
logger = TensorBoardLogger(save_dir='logs/tb_logs', name=args.exp_name, default_hp_metric=False) | |
ckpt_dir = Path(logger.log_dir) / 'checkpoints' | |
# Callbacks | |
# TODO: update ModelCheckpoint to monitor multiple metrics | |
ckpt_callback = ModelCheckpoint(monitor='auc@10', verbose=True, save_top_k=5, mode='max', | |
save_last=True, | |
dirpath=str(ckpt_dir), | |
filename='{epoch}-{auc@5:.3f}-{auc@10:.3f}-{auc@20:.3f}') | |
lr_monitor = LearningRateMonitor(logging_interval='step') | |
callbacks = [lr_monitor] | |
if not args.disable_ckpt: | |
callbacks.append(ckpt_callback) | |
# Lightning Trainer | |
trainer = pl.Trainer.from_argparse_args( | |
args, | |
plugins=DDPPlugin(find_unused_parameters=False, | |
num_nodes=args.num_nodes, | |
sync_batchnorm=config.TRAINER.WORLD_SIZE > 0), | |
gradient_clip_val=config.TRAINER.GRADIENT_CLIPPING, | |
callbacks=callbacks, | |
logger=logger, | |
sync_batchnorm=config.TRAINER.WORLD_SIZE > 0, | |
replace_sampler_ddp=False, # use custom sampler | |
reload_dataloaders_every_epoch=False, # avoid repeated samples! | |
weights_summary='full', | |
profiler=profiler) | |
loguru_logger.info(f"Trainer initialized!") | |
loguru_logger.info(f"Start training!") | |
trainer.fit(model, datamodule=data_module) | |
if __name__ == '__main__': | |
main() | |