import math
import argparse
import pprint
from distutils.util import strtobool  # note: distutils was removed in Python 3.12+
from pathlib import Path

from loguru import logger as loguru_logger

import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.plugins import DDPPlugin

from src.config.default import get_cfg_defaults
from src.utils.misc import get_rank_zero_only_logger, setup_gpus
from src.utils.profiler import build_profiler
from src.lightning.data import MultiSceneDataModule
from src.lightning.lightning_aspanformer import PL_ASpanFormer

# restrict loguru output to the rank-0 process under DDP
loguru_logger = get_rank_zero_only_logger(loguru_logger)


def parse_args():
    def str2bool(v):
        return v.lower() in ("true", "1")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("data_cfg_path", type=str, help="data config path")
    parser.add_argument("main_cfg_path", type=str, help="main config path")
    parser.add_argument("--exp_name", type=str, default="default_exp_name")
    parser.add_argument("--batch_size", type=int, default=4, help="batch size per GPU")
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument(
        "--pin_memory",
        type=lambda x: bool(strtobool(x)),
        nargs="?",
        default=True,
        help="whether to load data into pinned memory",
    )
    parser.add_argument(
        "--ckpt_path",
        type=str,
        default=None,
        help="pretrained checkpoint path, helpful for using a pre-trained coarse-only ASpanFormer",
    )
    parser.add_argument(
        "--disable_ckpt",
        action="store_true",
        help="disable checkpoint saving (useful for debugging).",
    )
    parser.add_argument(
        "--profiler_name",
        type=str,
        default=None,
        help="options: [inference, pytorch], or leave it unset",
    )
    parser.add_argument(
        "--parallel_load_data",
        action="store_true",
        help="load datasets with multiple processes.",
    )
    parser.add_argument(
        "--mode",
        type=str,
        default="vanilla",
        help="training mode",
    )
    parser.add_argument(
        "--ini",
        type=str2bool,
        default=False,
        help="whether to run the initialization stage",
    )
|
|
parser = pl.Trainer.add_argparse_args(parser) |
|
return parser.parse_args() |
|
|
|
|
|
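# Example invocation (paths and flag values below are illustrative placeholders,
# not prescriptive; substitute the config files shipped with this repo):
#
#   python train.py <data_cfg>.py <main_cfg>.py \
#       --exp_name my_exp --gpus 8 --num_nodes 1 --accelerator ddp \
#       --batch_size 2 --num_workers 8 --max_epochs 30

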
def main():
    # parse arguments
    args = parse_args()
    rank_zero_only(pprint.pprint)(vars(args))

    # init the default config and merge it with the main- and data-config
    config = get_cfg_defaults()
    config.merge_from_file(args.main_cfg_path)
    config.merge_from_file(args.data_cfg_path)
    pl.seed_everything(config.TRAINER.SEED)  # reproducibility

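    # The block below rescales optimization hyper-parameters with the effective
    # batch size (the common linear scaling rule). Worked example with
    # hypothetical config values CANONICAL_BS=64 and CANONICAL_LR=6e-3:
    # 8 GPUs x 1 node x batch_size 4 -> TRUE_BATCH_SIZE=32, _scaling=0.5,
    # TRUE_LR=3e-3, and WARMUP_STEP doubles (floor(WARMUP_STEP / 0.5)).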
    args.gpus = _n_gpus = setup_gpus(args.gpus)
    config.TRAINER.WORLD_SIZE = _n_gpus * args.num_nodes
    config.TRAINER.TRUE_BATCH_SIZE = config.TRAINER.WORLD_SIZE * args.batch_size
    _scaling = config.TRAINER.TRUE_BATCH_SIZE / config.TRAINER.CANONICAL_BS
    config.TRAINER.SCALING = _scaling
    config.TRAINER.TRUE_LR = config.TRAINER.CANONICAL_LR * _scaling
    config.TRAINER.WARMUP_STEP = math.floor(config.TRAINER.WARMUP_STEP / _scaling)

    # lightning module
    profiler = build_profiler(args.profiler_name)
    model = PL_ASpanFormer(config, pretrained_ckpt=args.ckpt_path, profiler=profiler)
    loguru_logger.info("ASpanFormer LightningModule initialized!")

    # lightning data module
    data_module = MultiSceneDataModule(args, config)
    loguru_logger.info("ASpanFormer DataModule initialized!")

    # TensorBoard logger
    logger = TensorBoardLogger(
        save_dir="logs/tb_logs", name=args.exp_name, default_hp_metric=False
    )
    ckpt_dir = Path(logger.log_dir) / "checkpoints"

    # callbacks: keep the 5 best checkpoints ranked by validation "auc@10";
    # saved files look like "epoch=29-auc@5=0.532-auc@10=0.692-auc@20=0.812.ckpt"
    # (metric values illustrative)
    ckpt_callback = ModelCheckpoint(
        monitor="auc@10",
        verbose=True,
        save_top_k=5,
        mode="max",
        save_last=True,
        dirpath=str(ckpt_dir),
        filename="{epoch}-{auc@5:.3f}-{auc@10:.3f}-{auc@20:.3f}",
    )
    lr_monitor = LearningRateMonitor(logging_interval="step")
    callbacks = [lr_monitor]
    if not args.disable_ckpt:
        callbacks.append(ckpt_callback)

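    # NOTE: the Trainer construction below targets the PyTorch Lightning 1.x API:
    # `Trainer.from_argparse_args`, `weights_summary`, and
    # `reload_dataloaders_every_epoch` were deprecated/removed in later releases,
    # and `DDPPlugin` was superseded by `pytorch_lightning.strategies.DDPStrategy`
    # from PL 1.6 onward.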
    trainer = pl.Trainer.from_argparse_args(
        args,
        plugins=DDPPlugin(
            find_unused_parameters=False,
            num_nodes=args.num_nodes,
            sync_batchnorm=config.TRAINER.WORLD_SIZE > 0,
        ),
        gradient_clip_val=config.TRAINER.GRADIENT_CLIPPING,
        callbacks=callbacks,
        logger=logger,
        sync_batchnorm=config.TRAINER.WORLD_SIZE > 0,
        replace_sampler_ddp=False,  # keep the sampler set up by the DataModule
        reload_dataloaders_every_epoch=False,  # avoid repeated dataset re-init
        weights_summary="full",
        profiler=profiler,
    )
    loguru_logger.info("Trainer initialized!")
    loguru_logger.info("Start training!")
    trainer.fit(model, datamodule=data_module)


if __name__ == "__main__":
    main()