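# MiniGPT4-video / train_multinode.py
# (Hosting-page header captured with this file, not part of the program:
#  uploader avatar "fffiloni", commit message "Upload 164 files",
#  commit 2ada650, verified.)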
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import minigpt4.tasks as tasks
from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank, init_distributed_mode
from minigpt4.common.logger import setup_logger
from minigpt4.common.optims import (
LinearWarmupCosineLRScheduler,
LinearWarmupStepLRScheduler,
)
from minigpt4.common.registry import registry
from minigpt4.common.utils import now
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
import wandb
import torch.distributed as dist
def parse_args():
    """Build the command-line parser for the training launcher.

    Despite the name, this returns the configured parser itself rather than
    parsed arguments; the caller invokes ``.parse_args()`` on the result.
    The parser is created with ``add_help=False``, so no ``-h/--help``
    option is registered.

    Returns:
        argparse.ArgumentParser: parser exposing ``--cfg-path`` (required),
        ``--options`` (one or more values), ``--job_name`` and the
        distributed-training flags ``--world_size``, ``--local_rank``,
        ``--dist_on_itp`` and ``--dist_url``.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--cfg-path", {"required": True, "help": "path to configuration file."}),
        ("--options", {"nargs": "+"}),
        ("--job_name", {"default": "minigpt_spatial_coco_control", "type": str}),
        # distributed training parameters
        (
            "--world_size",
            {"default": 1, "type": int, "help": "number of distributed processes"},
        ),
        ("--local_rank", {"default": -1, "type": int}),
        ("--dist_on_itp", {"action": "store_true"}),
        (
            "--dist_url",
            {"default": "env://", "help": "url used to set up distributed training"},
        ),
    )

    cli = argparse.ArgumentParser(add_help=False, description="Training")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN settings.

    The seed used is ``config.run_cfg.seed`` offset by the value returned
    from ``get_rank()``, so processes reporting different ranks are seeded
    differently.

    Args:
        config: object exposing ``run_cfg.seed``.
    """
    base_seed = config.run_cfg.seed
    rank_seed = base_seed + get_rank()

    # Same seed value for every RNG this script touches.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(rank_seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn.deterministic = True
    cudnn.benchmark = False
def get_runner_class(cfg):
    """Resolve the runner class named in the run configuration.

    Reads the ``"runner"`` entry of ``cfg.run_cfg``, falling back to
    ``"runner_base"`` when it is absent, and looks that name up in the
    registry.

    Returns:
        Whatever ``registry.get_runner_class`` returns for that name.
    """
    runner_name = cfg.run_cfg.get("runner", "runner_base")
    return registry.get_runner_class(runner_name)
def main():
    """Launch a distributed training run.

    In order: parse the CLI and build the config, make sure ``LOCAL_RANK``
    is present in the environment and select that CUDA device, create the
    NCCL process group, run the project's distributed setup, seed the RNGs,
    set up logging and Weights & Biases, build the task, datasets and model,
    then hand everything to the configured runner and train.
    """
    # allow auto-dl completes on main process without timeout when using NCCL backend.
    # os.environ["NCCL_BLOCKING_WAIT"] = "1"

    print("start!!!")
    # set before init_distributed_mode() to ensure the same job_id shared across all ranks.
    run_stamp = now()
    cli_args = parse_args().parse_args()
    print("0000")
    cfg = Config(cli_args)

    # Fall back to the --local_rank flag when the launcher did not export LOCAL_RANK.
    if os.environ.get("LOCAL_RANK") is None:
        print("not in the os")
        os.environ["LOCAL_RANK"] = str(cli_args.local_rank)
    print("111")

    # LOCAL_RANK is guaranteed to be set at this point (see the guard above).
    device_index = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(device_index)
    print("local rank", device_index)

    dist.init_process_group(init_method="env://", backend="nccl")
    # NOTE(review): get_world_size() reports the number of processes in the
    # group, so the "nodes" label printed below is only accurate when there
    # is exactly one process per node.
    world_size = dist.get_world_size()
    print(f"Number of nodes: {world_size}")

    # NOTE(review): init_distributed_mode() is project code not visible here;
    # confirm it copes with the process group already being initialized above.
    init_distributed_mode(cfg.run_cfg)
    setup_seeds(cfg)

    # set after init_distributed_mode() to only log on master.
    setup_logger()
    wandb.login()
    cfg.pretty_print()

    task = tasks.setup_task(cfg)
    datasets = task.build_datasets(cfg)
    model = task.build_model(cfg)

    # Only the process whose configured rank is 0 opens a W&B run.
    if cfg.run_cfg.rank == 0:
        print("project name", cli_args.job_name)
        wandb.init(name=cli_args.job_name, project="minigpt4-spatial")
        # NOTE(review): this rebinds the attribute wandb.config to a plain
        # dict of hard-coded values; verify they actually reach the run.
        wandb.config = dict(learning_rate=0.0001, epochs=100, batch_size=8)
        wandb.watch(model)

    runner_cls = get_runner_class(cfg)
    trainer = runner_cls(
        cfg=cfg,
        job_id=run_stamp,
        task=task,
        model=model,
        datasets=datasets,
    )
    trainer.train()
# Start training only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()