|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""This file contains code to create run an experiment.""" |
|
import functools |
|
import os |
|
from typing import Text, Optional, Sequence |
|
|
|
from absl import logging |
|
import orbit |
|
import tensorflow as tf |
|
|
|
from deeplab2 import common |
|
from deeplab2 import config_pb2 |
|
from deeplab2.data import dataset |
|
from deeplab2.model import deeplab |
|
from deeplab2.model.loss import loss_builder |
|
from deeplab2.trainer import distribution_utils |
|
from deeplab2.trainer import evaluator as evaluator_lib |
|
from deeplab2.trainer import runner_utils |
|
from deeplab2.trainer import trainer as trainer_lib |
|
from deeplab2.video import motion_deeplab |
|
from deeplab2.video import vip_deeplab |
|
|
|
# Checkpoint item names for the last layers of the instance-related heads.
# These entries are optionally dropped from the initialization dict when
# `restore_instance_last_layer_from_initial_checkpoint` is disabled (see
# run_experiment), e.g. when fine-tuning with a different number of classes.
_INSTANCE_LAYER_NAMES = (common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER,
                         common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER,
                         common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER)

# Two-frame models (MotionDeepLab / ViPDeepLab) take two RGB frames
# concatenated along the channel axis, hence 6 input channels.
_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS = 6

# Single-frame models take one RGB frame, hence 3 input channels.
_SINGLE_FRAME_INPUT_CHANNELS = 3
|
|
|
|
|
def create_deeplab_model(
    config: config_pb2.ExperimentOptions,
    dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.Model:
  """Instantiates the DeepLab model variant selected in the config.

  Args:
    config: A config_pb2.ExperimentOptions configuration.
    dataset_descriptor: A dataset.DatasetDescriptor for the target dataset.

  Returns:
    A MotionDeepLab, ViPDeepLab, or plain DeepLab model, depending on which
    `meta_architecture` oneof is set in `config.model_options`.
  """
  meta_architecture = config.model_options.WhichOneof('meta_architecture')
  if meta_architecture == 'motion_deeplab':
    return motion_deeplab.MotionDeepLab(config, dataset_descriptor)
  if meta_architecture == 'vip_deeplab':
    return vip_deeplab.ViPDeepLab(config, dataset_descriptor)
  return deeplab.DeepLab(config, dataset_descriptor)
|
|
|
|
|
def build_deeplab_model(deeplab_model: tf.keras.Model,
                        crop_size: Sequence[int],
                        batch_size: Optional[int] = None):
  """Builds DeepLab model with input crop size.

  Two-frame models (MotionDeepLab, ViPDeepLab) consume two RGB frames
  concatenated along the channel axis (6 channels); all other models take a
  single RGB frame (3 channels).

  Args:
    deeplab_model: A tf.keras.Model to build.
    batch_size: An optional integer batch size; None keeps the batch
      dimension dynamic.
    crop_size: A sequence of two integers, the input crop (height, width).

  Returns:
    The input shape [crop height, crop width, channels] the model was built
    with.
  """
  if isinstance(deeplab_model,
                (motion_deeplab.MotionDeepLab, vip_deeplab.ViPDeepLab)):
    input_channels = _TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS
  else:
    input_channels = _SINGLE_FRAME_INPUT_CHANNELS
  input_shape = list(crop_size) + [input_channels]
  # Calling the model once on a symbolic input creates all layer variables.
  deeplab_model(
      tf.keras.Input(input_shape, batch_size=batch_size), training=False)
  return input_shape
|
|
|
|
|
def run_experiment(mode: Text, config: config_pb2.ExperimentOptions,
                   model_dir: Text, tpu: Optional[Text], num_gpus: int):
  """Runs an experiment.

  Args:
    mode: A string specifying the mode of the experiment. Supported are
      `train`, `train_and_eval`, `eval` and `continuous_eval`.
    config: A config_pb2.ExperimentOptions configuration.
    model_dir: A path to store all checkpoints and other experimental
      artifacts.
    tpu: The name or address of the tpu to connect to, if any.
    num_gpus: An integer specifying the number of GPUs to use. If mode contains
      `eval`, num_gpus must be less than or equal to 1.

  Raises:
    ValueError: If mode is none of `train`, `train_and_eval`, `eval`, or
      `continuous_eval`.
    ValueError: If mode is `train_and_eval`, but different dataset_names are
      specified for training and evaluation. This error could be relaxed for
      applications like domain transferring learning (e.g., synthetic to real
      datasets), which has not been fully tested yet.
    ValueError: If mode includes `eval` and num_gpus > 1. Currently, evaluation
      is not supported on more than a single GPU.
  """
  strategy = distribution_utils.create_strategy(tpu, num_gpus)
  logging.info('Using strategy %s with %d replicas', type(strategy),
               strategy.num_replicas_in_sync)

  if 'eval' in mode:
    dataset_name = config.eval_dataset_options.dataset
    if (mode == 'train_and_eval' and
        dataset_name != config.train_dataset_options.dataset):
      raise ValueError('Using different dataset_names in train_and_eval mode.')
    if num_gpus > 1:
      raise ValueError(
          'Using more than one GPU for evaluation is not supported.')
  else:
    dataset_name = config.train_dataset_options.dataset

  # Look up the dataset descriptor once instead of repeating the map access.
  dataset_info = dataset.MAP_NAME_TO_DATASET_INFO[dataset_name]
  num_classes = dataset_info.num_classes
  ignore_label = dataset_info.ignore_label
  class_has_instances_list = dataset_info.class_has_instances_list

  trainer = None
  evaluator = None
  with strategy.scope():
    deeplab_model = create_deeplab_model(config, dataset_info)
    losses = loss_builder.DeepLabFamilyLoss(config.trainer_options.loss_options,
                                            num_classes, ignore_label,
                                            class_has_instances_list)
    global_step = orbit.utils.create_global_step()
    if 'train' in mode:
      trainer = trainer_lib.Trainer(config, deeplab_model, losses, global_step)
    if 'eval' in mode:
      evaluator = evaluator_lib.Evaluator(config, deeplab_model, losses,
                                          global_step, model_dir)

  checkpoint_dict = dict(global_step=global_step)
  checkpoint_dict.update(deeplab_model.checkpoint_items)
  if trainer is not None:
    checkpoint_dict['optimizer'] = trainer.optimizer
    if trainer.backbone_optimizer is not None:
      checkpoint_dict['backbone_optimizer'] = trainer.backbone_optimizer
  checkpoint = tf.train.Checkpoint(**checkpoint_dict)

  # Copy the mapping so that the deletions below cannot mutate a dict owned by
  # the model (or aliased via checkpoint_dict above), in case checkpoint_items
  # returns a shared object.
  init_dict = dict(deeplab_model.checkpoint_items)
  if (not config.model_options
      .restore_semantic_last_layer_from_initial_checkpoint):
    del init_dict[common.CKPT_SEMANTIC_LAST_LAYER]
  if (not config.model_options
      .restore_instance_last_layer_from_initial_checkpoint):
    for layer_name in _INSTANCE_LAYER_NAMES:
      if layer_name in init_dict:
        del init_dict[layer_name]
  # init_fn loads the (possibly pruned) items from the initial checkpoint; it
  # is only invoked by the CheckpointManager when model_dir has no checkpoint.
  init_fn = functools.partial(runner_utils.maybe_load_checkpoint,
                              config.model_options.initial_checkpoint,
                              init_dict)
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=config.trainer_options.num_checkpoints_to_keep,
      step_counter=global_step,
      checkpoint_interval=config.trainer_options.save_checkpoints_steps,
      init_fn=init_fn)

  controller = orbit.Controller(
      strategy=strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=config.trainer_options.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_interval=config.trainer_options.save_summaries_steps,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'eval'))

  with strategy.scope():
    if 'train' in mode:
      crop_size = list(config.train_dataset_options.crop_size)
      # Build the model variables and persist an initial checkpoint before
      # training starts.
      build_deeplab_model(deeplab_model, crop_size)
      controller.save_checkpoint()
      if mode == 'train':
        controller.train(
            steps=config.trainer_options.solver_options
            .training_number_of_steps)
      elif mode == 'train_and_eval':
        controller.train_and_evaluate(
            train_steps=(
                config.trainer_options.solver_options
                .training_number_of_steps),
            eval_steps=config.evaluator_options.eval_steps,
            eval_interval=config.evaluator_options.eval_interval)
    elif mode == 'eval':
      controller.evaluate(steps=config.evaluator_options.eval_steps)
    elif mode == 'continuous_eval':
      # A timeout of -1 in the config means "never time out": keep polling
      # model_dir for new checkpoints indefinitely.
      timeout = config.evaluator_options.continuous_eval_timeout
      if timeout == -1:
        timeout = None
      controller.evaluate_continuously(
          steps=config.evaluator_options.eval_steps, timeout=timeout)
    else:
      raise ValueError('Mode %s is not a valid mode.' % mode)
|
|