#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Grid features pre-training script.
This script is a simplified version of the training script in detectron2/tools.
"""
import os
import time
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.evaluation import COCOEvaluator, DatasetEvaluators, verify_results
from grid_feats import (
    add_attribute_config,
    build_detection_train_loader_with_attributes,
    build_detection_test_loader_with_attributes,
)


class Trainer(DefaultTrainer):
    """
    A trainer for the Visual Genome dataset.
    """

    def __init__(self, cfg):
        super().__init__(cfg)
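        # Custom box-regression loss weights (config keys defined via add_attribute_config);
        # they are applied manually in run_step() rather than inside the model.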
        self.rpn_box_lw = cfg.MODEL.RPN.BBOX_LOSS_WEIGHT
        self.rcnn_box_lw = cfg.MODEL.ROI_BOX_HEAD.BBOX_LOSS_WEIGHT

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_list = []
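        # Only COCO-style evaluation is wired up here; evaluator_list is kept so that
        # additional evaluators can be appended for other dataset types.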
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type == "coco":
            return COCOEvaluator(dataset_name, cfg, True, output_folder)
        if len(evaluator_list) == 0:
            raise NotImplementedError(
                "No evaluator is available for dataset {} with evaluator type {}".format(
                    dataset_name, evaluator_type
                )
            )
        if len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

    @classmethod
    def build_train_loader(cls, cfg):
        return build_detection_train_loader_with_attributes(cfg)

    @classmethod
    def build_test_loader(cls, cfg, dataset_name):
        return build_detection_test_loader_with_attributes(cfg, dataset_name)

    def run_step(self):
        """
        Overrides SimpleTrainer.run_step to re-weight the box-regression losses
        before back-propagation (a small hack, since the weights are not applied
        inside the model itself).
        """
        assert self.model.training, "[Trainer] model was changed to eval mode!"
        start = time.perf_counter()
        data = next(self._data_loader_iter)
        data_time = time.perf_counter() - start
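
        # In training mode, a detectron2 model returns a dict of per-term losses.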
        loss_dict = self.model(data)
        # RPN box loss:
        loss_dict["loss_rpn_loc"] *= self.rpn_box_lw
        # R-CNN box loss:
        loss_dict["loss_box_reg"] *= self.rcnn_box_lw
        losses = sum(loss_dict.values())
        self._detect_anomaly(losses, loss_dict)

        metrics_dict = loss_dict
        metrics_dict["data_time"] = data_time
        self._write_metrics(metrics_dict)
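
        # Standard optimization step: clear gradients, back-propagate, update weights.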
        self.optimizer.zero_grad()
        losses.backward()
        self.optimizer.step()


def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    add_attribute_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg


def main(args):
    cfg = setup(args)
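
    # Evaluation-only mode: build the model, load cfg.MODEL.WEIGHTS (or resume from
    # the last checkpoint in OUTPUT_DIR when --resume is given), and run the test loop.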
    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res
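
    # Training path: build the Trainer, optionally resume from the last checkpoint,
    # and run the training loop.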
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)