Vincentqyw
fix: roma
c74a070
raw
history blame
5.64 kB
import math
import argparse
import pprint
from distutils.util import strtobool
from pathlib import Path
from loguru import logger as loguru_logger
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.plugins import DDPPlugin
from src.config.default import get_cfg_defaults
from src.utils.misc import get_rank_zero_only_logger, setup_gpus
from src.utils.profiler import build_profiler
from src.lightning.data import MultiSceneDataModule
from src.lightning.lightning_aspanformer import PL_ASpanFormer
loguru_logger = get_rank_zero_only_logger(loguru_logger)
def parse_args():
def str2bool(v):
return v.lower() in ("true", "1")
# init a costum parser which will be added into pl.Trainer parser
# check documentation: https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-flags
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("data_cfg_path", type=str, help="data config path")
parser.add_argument("main_cfg_path", type=str, help="main config path")
parser.add_argument("--exp_name", type=str, default="default_exp_name")
parser.add_argument("--batch_size", type=int, default=4, help="batch_size per gpu")
parser.add_argument("--num_workers", type=int, default=4)
parser.add_argument(
"--pin_memory",
type=lambda x: bool(strtobool(x)),
nargs="?",
default=True,
help="whether loading data to pinned memory or not",
)
parser.add_argument(
"--ckpt_path",
type=str,
default=None,
help="pretrained checkpoint path, helpful for using a pre-trained coarse-only ASpanFormer",
)
parser.add_argument(
"--disable_ckpt",
action="store_true",
help="disable checkpoint saving (useful for debugging).",
)
parser.add_argument(
"--profiler_name",
type=str,
default=None,
help="options: [inference, pytorch], or leave it unset",
)
parser.add_argument(
"--parallel_load_data",
action="store_true",
help="load datasets in with multiple processes.",
)
parser.add_argument(
"--mode",
type=str,
default="vanilla",
help="pretrained checkpoint path, helpful for using a pre-trained coarse-only ASpanFormer",
)
parser.add_argument(
"--ini",
type=str2bool,
default=False,
help="pretrained checkpoint path, helpful for using a pre-trained coarse-only ASpanFormer",
)
parser = pl.Trainer.add_argparse_args(parser)
return parser.parse_args()
def main():
# parse arguments
args = parse_args()
rank_zero_only(pprint.pprint)(vars(args))
# init default-cfg and merge it with the main- and data-cfg
config = get_cfg_defaults()
config.merge_from_file(args.main_cfg_path)
config.merge_from_file(args.data_cfg_path)
pl.seed_everything(config.TRAINER.SEED) # reproducibility
# TODO: Use different seeds for each dataloader workers
# This is needed for data augmentation
# scale lr and warmup-step automatically
args.gpus = _n_gpus = setup_gpus(args.gpus)
config.TRAINER.WORLD_SIZE = _n_gpus * args.num_nodes
config.TRAINER.TRUE_BATCH_SIZE = config.TRAINER.WORLD_SIZE * args.batch_size
_scaling = config.TRAINER.TRUE_BATCH_SIZE / config.TRAINER.CANONICAL_BS
config.TRAINER.SCALING = _scaling
config.TRAINER.TRUE_LR = config.TRAINER.CANONICAL_LR * _scaling
config.TRAINER.WARMUP_STEP = math.floor(config.TRAINER.WARMUP_STEP / _scaling)
# lightning module
profiler = build_profiler(args.profiler_name)
model = PL_ASpanFormer(config, pretrained_ckpt=args.ckpt_path, profiler=profiler)
loguru_logger.info(f"ASpanFormer LightningModule initialized!")
# lightning data
data_module = MultiSceneDataModule(args, config)
loguru_logger.info(f"ASpanFormer DataModule initialized!")
# TensorBoard Logger
logger = TensorBoardLogger(
save_dir="logs/tb_logs", name=args.exp_name, default_hp_metric=False
)
ckpt_dir = Path(logger.log_dir) / "checkpoints"
# Callbacks
# TODO: update ModelCheckpoint to monitor multiple metrics
ckpt_callback = ModelCheckpoint(
monitor="auc@10",
verbose=True,
save_top_k=5,
mode="max",
save_last=True,
dirpath=str(ckpt_dir),
filename="{epoch}-{auc@5:.3f}-{auc@10:.3f}-{auc@20:.3f}",
)
lr_monitor = LearningRateMonitor(logging_interval="step")
callbacks = [lr_monitor]
if not args.disable_ckpt:
callbacks.append(ckpt_callback)
# Lightning Trainer
trainer = pl.Trainer.from_argparse_args(
args,
plugins=DDPPlugin(
find_unused_parameters=False,
num_nodes=args.num_nodes,
sync_batchnorm=config.TRAINER.WORLD_SIZE > 0,
),
gradient_clip_val=config.TRAINER.GRADIENT_CLIPPING,
callbacks=callbacks,
logger=logger,
sync_batchnorm=config.TRAINER.WORLD_SIZE > 0,
replace_sampler_ddp=False, # use custom sampler
reload_dataloaders_every_epoch=False, # avoid repeated samples!
weights_summary="full",
profiler=profiler,
)
loguru_logger.info(f"Trainer initialized!")
loguru_logger.info(f"Start training!")
trainer.fit(model, datamodule=data_module)
if __name__ == "__main__":
main()