deeplab2 / trainer /train_lib.py
akhaliq3
spaces demo
506da10
# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains code to create and run an experiment."""
import functools
import os
from typing import Text, Optional, Sequence
from absl import logging
import orbit
import tensorflow as tf
from deeplab2 import common
from deeplab2 import config_pb2
from deeplab2.data import dataset
from deeplab2.model import deeplab
from deeplab2.model.loss import loss_builder
from deeplab2.trainer import distribution_utils
from deeplab2.trainer import evaluator as evaluator_lib
from deeplab2.trainer import runner_utils
from deeplab2.trainer import trainer as trainer_lib
from deeplab2.video import motion_deeplab
from deeplab2.video import vip_deeplab
# Keys into the model's `checkpoint_items` mapping. run_experiment() removes
# whichever of these are present from the items restored from the initial
# checkpoint when `restore_instance_last_layer_from_initial_checkpoint` is
# false.
_INSTANCE_LAYER_NAMES = (common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER,
common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER,
common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER)
# For Motion-DeepLab, 6 channels are used as input (2x RGB) during inference.
# Its 7th input channel is obtained from the predicted center heatmap of the
# previous frame. build_deeplab_model() uses this same channel count when
# building ViP-DeepLab models.
_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS = 6
# All other networks use 3 channels as input (RGB).
_SINGLE_FRAME_INPUT_CHANNELS = 3
def create_deeplab_model(
    config: config_pb2.ExperimentOptions,
    dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.Model:
  """Instantiates the model class selected by the config's meta architecture.

  Args:
    config: The experiment configuration. The `meta_architecture` oneof of its
      `model_options` decides which model class is used.
    dataset_descriptor: The dataset descriptor, passed unchanged to the model
      constructor.

  Returns:
    A `MotionDeepLab` if the oneof is set to `motion_deeplab`, a `ViPDeepLab`
    if it is set to `vip_deeplab`, and a plain `DeepLab` in every other case.
  """
  meta_architecture = config.model_options.WhichOneof('meta_architecture')
  if meta_architecture == 'motion_deeplab':
    model_cls = motion_deeplab.MotionDeepLab
  elif meta_architecture == 'vip_deeplab':
    model_cls = vip_deeplab.ViPDeepLab
  else:
    # Any other value falls back to the single-frame model.
    model_cls = deeplab.DeepLab
  return model_cls(config, dataset_descriptor)
def build_deeplab_model(deeplab_model: tf.keras.Model,
                        crop_size: Sequence[int],
                        batch_size: Optional[int] = None):
  """Builds the model by calling it once on a symbolic input.

  Args:
    deeplab_model: The model to build.
    crop_size: The leading dimensions of the input; the channel dimension is
      appended to it.
    batch_size: Optional batch size forwarded to `tf.keras.Input`.

  Returns:
    The input shape as a list: `crop_size` followed by the channel count.
  """
  # Motion-DeepLab and ViP-DeepLab consume the input differently even though
  # the input shape is identical: Motion-DeepLab treats the two frames as one
  # input, whereas ViP-DeepLab first splits them and feeds each frame to the
  # backbone encoder separately.
  two_frame_model_types = (motion_deeplab.MotionDeepLab,
                           vip_deeplab.ViPDeepLab)
  if isinstance(deeplab_model, two_frame_model_types):
    num_channels = _TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS
  else:
    num_channels = _SINGLE_FRAME_INPUT_CHANNELS

  input_shape = [*crop_size, num_channels]
  symbolic_input = tf.keras.Input(input_shape, batch_size=batch_size)
  deeplab_model(symbolic_input, training=False)
  return input_shape
def run_experiment(mode: Text, config: config_pb2.ExperimentOptions,
                   model_dir: Text, tpu: Optional[Text], num_gpus: int):
  """Runs an experiment.

  Creates the distribution strategy, the model, the loss and (depending on
  `mode`) a trainer and/or an evaluator, wires them into an `orbit.Controller`
  together with a `tf.train.CheckpointManager`, and finally runs training
  and/or evaluation.

  Args:
    mode: A string specifying the mode of the experiment. Supported are
      `train`, `train_and_eval`, `eval` and `continuous_eval`.
    config: A config_pb2.ExperimentOptions configuration.
    model_dir: A path to store all checkpoints and other experimental
      artifacts.
    tpu: The name or address of the tpu to connect to, if any.
    num_gpus: An integer specifying the number of GPUs to use. If mode contains
      `eval`, num_gpus must be less or equal to 1.

  Raises:
    ValueError: If mode is none of `train`, `train_and_eval`, `eval`, or
      `continuous_eval`. This is checked first, before any strategy, model or
      checkpoint is created.
    ValueError: If mode is `train_and_eval`, but different dataset_names are
      specified for training and evaluation. This error could be relaxed for
      applications like domain transferring learning (e.g., synthetic to real
      datasets), which has not been fully tested yet.
    ValueError: If mode includes `eval` and num_gpus > 1. Currently, evaluation
      is not supported on more than a single GPU.
  """
  # Fail fast on an unsupported mode. The checks below use substring matching
  # ('train' in mode), so a value such as 'training' would otherwise build the
  # model and write an initial checkpoint to model_dir before being rejected.
  supported_modes = ('train', 'train_and_eval', 'eval', 'continuous_eval')
  if mode not in supported_modes:
    raise ValueError('Mode %s is not a valid mode.' % mode)

  strategy = distribution_utils.create_strategy(tpu, num_gpus)
  logging.info('Using strategy %s with %d replicas', type(strategy),
               strategy.num_replicas_in_sync)

  # Any mode that evaluates takes its dataset from the eval options.
  if 'eval' in mode:
    dataset_name = config.eval_dataset_options.dataset
    if (mode == 'train_and_eval' and
        dataset_name != config.train_dataset_options.dataset):
      raise ValueError('Using difference dataset_names in train_and_eval mode.')
    if num_gpus > 1:
      raise ValueError(
          'Using more than one GPU for evaluation is not supported.')
  else:
    dataset_name = config.train_dataset_options.dataset

  dataset_info = dataset.MAP_NAME_TO_DATASET_INFO[dataset_name]
  num_classes = dataset_info.num_classes
  ignore_label = dataset_info.ignore_label
  class_has_instances_list = dataset_info.class_has_instances_list

  trainer = None
  evaluator = None
  with strategy.scope():
    deeplab_model = create_deeplab_model(config, dataset_info)
    losses = loss_builder.DeepLabFamilyLoss(config.trainer_options.loss_options,
                                            num_classes, ignore_label,
                                            class_has_instances_list)
    global_step = orbit.utils.create_global_step()
    if 'train' in mode:
      trainer = trainer_lib.Trainer(config, deeplab_model, losses, global_step)
    if 'eval' in mode:
      evaluator = evaluator_lib.Evaluator(config, deeplab_model, losses,
                                          global_step, model_dir)

  # Items that are saved to / restored from checkpoints in model_dir.
  checkpoint_dict = dict(global_step=global_step)
  checkpoint_dict.update(deeplab_model.checkpoint_items)
  if trainer is not None:
    checkpoint_dict['optimizer'] = trainer.optimizer
    if trainer.backbone_optimizer is not None:
      checkpoint_dict['backbone_optimizer'] = trainer.backbone_optimizer
  checkpoint = tf.train.Checkpoint(**checkpoint_dict)

  # Define items to load from initial checkpoint. Work on a copy so that the
  # deletions below cannot affect the mapping returned by the model, in case
  # `checkpoint_items` hands out a shared object.
  init_dict = dict(deeplab_model.checkpoint_items)
  if (not config.model_options
      .restore_semantic_last_layer_from_initial_checkpoint):
    del init_dict[common.CKPT_SEMANTIC_LAST_LAYER]
  if (not config.model_options
      .restore_instance_last_layer_from_initial_checkpoint):
    # Not every model exposes all instance layers, so only drop those present.
    for layer_name in _INSTANCE_LAYER_NAMES:
      if layer_name in init_dict:
        del init_dict[layer_name]
  init_fn = functools.partial(runner_utils.maybe_load_checkpoint,
                              config.model_options.initial_checkpoint,
                              init_dict)
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=config.trainer_options.num_checkpoints_to_keep,
      step_counter=global_step,
      checkpoint_interval=config.trainer_options.save_checkpoints_steps,
      init_fn=init_fn)

  controller = orbit.Controller(
      strategy=strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=config.trainer_options.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_interval=config.trainer_options.save_summaries_steps,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'eval'))

  with strategy.scope():
    # Save initial checkpoint.
    if 'train' in mode:
      crop_size = list(config.train_dataset_options.crop_size)
      # Build model before saving.
      build_deeplab_model(deeplab_model, crop_size)
      controller.save_checkpoint()

    # `mode` was validated on entry, so exactly one branch below runs.
    if mode == 'train':
      controller.train(
          steps=config.trainer_options.solver_options.training_number_of_steps)
    elif mode == 'train_and_eval':
      # Interleave training and evaluation.
      controller.train_and_evaluate(
          train_steps=(
              config.trainer_options.solver_options.training_number_of_steps),
          eval_steps=config.evaluator_options.eval_steps,
          eval_interval=config.evaluator_options.eval_interval)
    elif mode == 'eval':
      controller.evaluate(steps=config.evaluator_options.eval_steps)
    elif mode == 'continuous_eval':
      # Monitor the checkpoint directory for new checkpoints to evaluate.
      timeout = config.evaluator_options.continuous_eval_timeout
      if timeout == -1:
        # Wait forever
        timeout = None
      controller.evaluate_continuously(
          steps=config.evaluator_options.eval_steps, timeout=timeout)