# Spaces: Sleeping  (page-status text captured along with the file; not part of the program)
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import argparse | |
import os | |
import random | |
import numpy as np | |
import torch | |
import torch.backends.cudnn as cudnn | |
import minigpt4.tasks as tasks | |
from minigpt4.common.config import Config | |
from minigpt4.common.dist_utils import get_rank, init_distributed_mode | |
from minigpt4.common.logger import setup_logger | |
from minigpt4.common.optims import ( | |
LinearWarmupCosineLRScheduler, | |
LinearWarmupStepLRScheduler, | |
) | |
from minigpt4.common.registry import registry | |
from minigpt4.common.utils import now | |
# imports modules for registration | |
from minigpt4.datasets.builders import * | |
from minigpt4.models import * | |
from minigpt4.processors import * | |
from minigpt4.runners import * | |
from minigpt4.tasks import * | |
import wandb | |
import torch.distributed as dist | |
def parse_args() -> argparse.ArgumentParser:
    """Build the command-line parser for the training script.

    Note that this returns the *parser*, not a parsed namespace: the caller
    is expected to invoke ``.parse_args()`` on the result. The parser is
    created with ``add_help=False``, so no ``-h/--help`` option is
    registered.

    Returns:
        argparse.ArgumentParser: parser exposing ``--cfg-path`` (required),
        ``--options``, ``--job_name`` and the distributed-training options
        ``--world_size``, ``--local_rank``, ``--dist_on_itp`` and
        ``--dist_url``.
    """
    parser = argparse.ArgumentParser(description="Training", add_help=False)

    parser.add_argument(
        "--cfg-path", required=True, help="path to configuration file."
    )
    parser.add_argument(
        "--options",
        nargs="+",
        help="one or more extra option strings passed along with the parsed arguments",
    )
    parser.add_argument(
        "--job_name",
        default="minigpt_spatial_coco_control",
        type=str,
        help="name of this training job",
    )

    # distributed training parameters
    parser.add_argument(
        "--world_size",
        default=1,
        type=int,
        help="number of distributed processes",
    )
    parser.add_argument(
        "--local_rank",
        default=-1,
        type=int,
        help="local rank of this process",
    )
    parser.add_argument("--dist_on_itp", action="store_true")
    parser.add_argument(
        "--dist_url",
        default="env://",
        help="url used to set up distributed training",
    )

    return parser
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN settings.

    The seed used is ``config.run_cfg.seed`` plus the value returned by
    ``get_rank()``, so processes with different ranks get different seeds.

    Args:
        config: object exposing an integer ``run_cfg.seed``.
    """
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Turn off the cuDNN autotuner and request deterministic algorithms.
    cudnn.benchmark = False
    cudnn.deterministic = True
def get_runner_class(cfg):
    """Look up the runner class named in the config.

    The name is read from the ``"runner"`` key of ``cfg.run_cfg`` and falls
    back to ``"runner_base"`` when that key is absent (the original note
    describes this default as the epoch-based runner).

    Args:
        cfg: configuration object whose ``run_cfg`` supports ``.get``.

    Returns:
        Whatever ``registry.get_runner_class`` returns for that name.
    """
    runner_name = cfg.run_cfg.get("runner", "runner_base")
    return registry.get_runner_class(runner_name)
def main():
    """Parse arguments, initialise distributed state and logging, then train.

    Steps, in order: parse the CLI and build the config; make sure
    ``LOCAL_RANK`` is set in the environment and select that CUDA device;
    initialise the NCCL process group; run the project's distributed setup,
    seeding and logger setup; log in to Weights & Biases; build the task,
    datasets and model; start a W&B run on rank 0; build the runner and call
    ``train()`` on it.
    """
    # Kept from the original as a note: setting NCCL_BLOCKING_WAIT=1 was
    # considered so the main process can finish auto-downloads without an
    # NCCL timeout; it is not enabled.

    print("start!!!")

    # set before init_distributed_mode() to ensure the same job_id shared across all ranks.
    job_id = now()

    args = parse_args().parse_args()
    print("0000")
    cfg = Config(args)

    # Fall back to the --local_rank flag when the launcher did not export LOCAL_RANK.
    if 'LOCAL_RANK' not in os.environ:
        print("not in the os")
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    print("111")

    # LOCAL_RANK is guaranteed to be present at this point, so no default is needed.
    # With the flag's default of -1 the set_device call below is documented
    # by PyTorch to be a no-op.
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    print("local rank", local_rank)

    # Honour --dist_url (default 'env://') instead of a hard-coded init method.
    dist.init_process_group(backend='nccl', init_method=args.dist_url)
    # get_world_size() is the number of processes in the group; the message
    # text is kept as it was even though it says "nodes".
    world_size = dist.get_world_size()
    print(f"Number of nodes: {world_size}")

    # NOTE(review): init_distributed_mode is not visible here; if it also
    # creates the default process group, this is a second initialisation -
    # confirm against minigpt4.common.dist_utils.
    init_distributed_mode(cfg.run_cfg)

    setup_seeds(cfg)

    # set after init_distributed_mode() to only log on master.
    setup_logger()

    wandb.login()

    cfg.pretty_print()

    task = tasks.setup_task(cfg)
    datasets = task.build_datasets(cfg)
    model = task.build_model(cfg)

    # Only rank 0 opens a W&B run and attaches the model to it.
    # presumably cfg.run_cfg.rank is filled in by init_distributed_mode - TODO confirm
    if cfg.run_cfg.rank == 0:
        print("project name", args.job_name)
        # Pass the run config through wandb.init rather than rebinding the
        # module attribute `wandb.config`, which would not attach it to the run.
        # NOTE(review): these values are hard-coded, not read from cfg.
        wandb.init(
            project="minigpt4-spatial",
            name=args.job_name,
            config={"learning_rate": 0.0001, "epochs": 100, "batch_size": 8},
        )
        wandb.watch(model)

    runner_cls = get_runner_class(cfg)
    runner = runner_cls(
        cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
    )
    runner.train()
# Start training only when this file is executed as a script.
if __name__ == "__main__":
    main()