# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file contains code to create and run an experiment."""

import functools
import os
from typing import Text, Optional, Sequence

from absl import logging
import orbit
import tensorflow as tf

from deeplab2 import common
from deeplab2 import config_pb2
from deeplab2.data import dataset
from deeplab2.model import deeplab
from deeplab2.model.loss import loss_builder
from deeplab2.trainer import distribution_utils
from deeplab2.trainer import evaluator as evaluator_lib
from deeplab2.trainer import runner_utils
from deeplab2.trainer import trainer as trainer_lib
from deeplab2.video import motion_deeplab
from deeplab2.video import vip_deeplab

_INSTANCE_LAYER_NAMES = (common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER,
                         common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER,
                         common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER)

# For Motion-DeepLab, 6 channels are used as input (2x RGB) during inference.
# Its 7th input channel is obtained from the predicted center heatmap of the
# previous frame.
_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS = 6
# All other networks use 3 channels as input (RGB).
_SINGLE_FRAME_INPUT_CHANNELS = 3


def create_deeplab_model(
    config: config_pb2.ExperimentOptions,
    dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.Model:
  """Creates a DeepLab model based on the config."""
  if config.model_options.WhichOneof('meta_architecture') == 'motion_deeplab':
    return motion_deeplab.MotionDeepLab(config, dataset_descriptor)
  elif config.model_options.WhichOneof('meta_architecture') == 'vip_deeplab':
    return vip_deeplab.ViPDeepLab(config, dataset_descriptor)
  else:
    return deeplab.DeepLab(config, dataset_descriptor)


def build_deeplab_model(deeplab_model: tf.keras.Model,
                        crop_size: Sequence[int],
                        batch_size: Optional[int] = None):
  """Builds the DeepLab model with the given input crop size."""
  if isinstance(deeplab_model,
                (motion_deeplab.MotionDeepLab, vip_deeplab.ViPDeepLab)):
    # Motion-DeepLab and ViP-DeepLab consume the input differently even though
    # the input_shape is the same: Motion-DeepLab uses the two frames as one
    # stacked input, while ViP-DeepLab splits the two frames first and passes
    # them individually to the backbone encoder.
    input_shape = list(crop_size) + [_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS]
  else:
    input_shape = list(crop_size) + [_SINGLE_FRAME_INPUT_CHANNELS]
  # Calling the model once on a symbolic input builds all of its variables.
  deeplab_model(
      tf.keras.Input(input_shape, batch_size=batch_size), training=False)
  return input_shape


def run_experiment(mode: Text, config: config_pb2.ExperimentOptions,
                   model_dir: Text, tpu: Optional[Text], num_gpus: int):
  """Runs an experiment.

  Args:
    mode: A string specifying the mode of the experiment. Supported are
      `train`, `train_and_eval`, `eval`, and `continuous_eval`.
    config: A config_pb2.ExperimentOptions configuration.
    model_dir: A path to store all checkpoints and other experimental
      artifacts.
    tpu: The name or address of the TPU to connect to, if any.
    num_gpus: An integer specifying the number of GPUs to use. If mode contains
      `eval`, num_gpus must be less than or equal to 1.

  Raises:
    ValueError: If mode is none of `train`, `train_and_eval`, `eval`, or
      `continuous_eval`.
    ValueError: If mode is `train_and_eval`, but different dataset_names are
      specified for training and evaluation. This error could be relaxed for
      applications like domain transfer learning (e.g., synthetic-to-real
      datasets), which has not been fully tested yet.
    ValueError: If mode includes `eval` and num_gpus > 1. Currently, evaluation
      is not supported on more than a single GPU.
  """
  strategy = distribution_utils.create_strategy(tpu, num_gpus)
  logging.info('Using strategy %s with %d replicas', type(strategy),
               strategy.num_replicas_in_sync)

  if 'eval' in mode:
    dataset_name = config.eval_dataset_options.dataset
    if (mode == 'train_and_eval' and
        dataset_name != config.train_dataset_options.dataset):
      raise ValueError('Using different dataset_names in train_and_eval mode.')
    if num_gpus > 1:
      raise ValueError(
          'Using more than one GPU for evaluation is not supported.')
  else:
    dataset_name = config.train_dataset_options.dataset

  num_classes = dataset.MAP_NAME_TO_DATASET_INFO[dataset_name].num_classes
  ignore_label = dataset.MAP_NAME_TO_DATASET_INFO[dataset_name].ignore_label
  class_has_instances_list = (
      dataset.MAP_NAME_TO_DATASET_INFO[dataset_name].class_has_instances_list)

  trainer = None
  evaluator = None
  with strategy.scope():
    deeplab_model = create_deeplab_model(
        config, dataset.MAP_NAME_TO_DATASET_INFO[dataset_name])
    losses = loss_builder.DeepLabFamilyLoss(config.trainer_options.loss_options,
                                            num_classes, ignore_label,
                                            class_has_instances_list)
    global_step = orbit.utils.create_global_step()
    if 'train' in mode:
      trainer = trainer_lib.Trainer(config, deeplab_model, losses, global_step)
    if 'eval' in mode:
      evaluator = evaluator_lib.Evaluator(config, deeplab_model, losses,
                                          global_step, model_dir)

  # Aggregate everything that should be checkpointed: the global step, the
  # model weights, and (when training) the optimizer state.
  checkpoint_dict = dict(global_step=global_step)
  checkpoint_dict.update(deeplab_model.checkpoint_items)
  if trainer is not None:
    checkpoint_dict['optimizer'] = trainer.optimizer
    if trainer.backbone_optimizer is not None:
      checkpoint_dict['backbone_optimizer'] = trainer.backbone_optimizer
  checkpoint = tf.train.Checkpoint(**checkpoint_dict)

  # Define items to load from the initial checkpoint, optionally excluding the
  # final prediction layers (e.g., when fine-tuning on a dataset with a
  # different number of classes).
  init_dict = deeplab_model.checkpoint_items
  if (not config.model_options
      .restore_semantic_last_layer_from_initial_checkpoint):
    del init_dict[common.CKPT_SEMANTIC_LAST_LAYER]
  if (not config.model_options
      .restore_instance_last_layer_from_initial_checkpoint):
    for layer_name in _INSTANCE_LAYER_NAMES:
      if layer_name in init_dict:
        del init_dict[layer_name]
  init_fn = functools.partial(runner_utils.maybe_load_checkpoint,
                              config.model_options.initial_checkpoint,
                              init_dict)
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=config.trainer_options.num_checkpoints_to_keep,
      step_counter=global_step,
      checkpoint_interval=config.trainer_options.save_checkpoints_steps,
      init_fn=init_fn)

  controller = orbit.Controller(
      strategy=strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=config.trainer_options.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_interval=config.trainer_options.save_summaries_steps,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'eval'))

  with strategy.scope():
    # Save the initial checkpoint.
    if 'train' in mode:
      crop_size = list(config.train_dataset_options.crop_size)
      # Build the model before saving.
      build_deeplab_model(deeplab_model, crop_size)
      controller.save_checkpoint()

  if mode == 'train':
    controller.train(
        steps=config.trainer_options.solver_options.training_number_of_steps)
  elif mode == 'train_and_eval':
    # Interleave training and evaluation.
    controller.train_and_evaluate(
        train_steps=(
            config.trainer_options.solver_options.training_number_of_steps),
        eval_steps=config.evaluator_options.eval_steps,
        eval_interval=config.evaluator_options.eval_interval)
  elif mode == 'eval':
    controller.evaluate(steps=config.evaluator_options.eval_steps)
  elif mode == 'continuous_eval':
    # Monitor the checkpoint directory for new checkpoints to evaluate.
    timeout = config.evaluator_options.continuous_eval_timeout
    if timeout == -1:
      # Wait forever.
      timeout = None
    controller.evaluate_continuously(
        steps=config.evaluator_options.eval_steps,
        timeout=timeout)
  else:
    raise ValueError('Mode %s is not a valid mode.' % mode)
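

# The canonical entry point for this runner is a separate flags-based main
# (e.g., trainer/train.py in this repository). The guarded block below is a
# minimal, illustrative sketch of invoking `run_experiment` directly: the
# config path and model directory are placeholder values, and reading the
# ExperimentOptions from a textproto via `text_format.Parse` is an assumption
# about how the config is stored.
if __name__ == '__main__':
  from google.protobuf import text_format

  example_config = config_pb2.ExperimentOptions()
  # Placeholder path: replace with a real experiment config in textproto
  # format.
  with tf.io.gfile.GFile('/path/to/experiment_config.textproto', 'r') as f:
    text_format.Parse(f.read(), example_config)
  run_experiment(
      mode='train_and_eval',
      config=example_config,
      model_dir='/path/to/model_dir',  # Placeholder output directory.
      tpu=None,  # No TPU; assumes distribution_utils then picks a GPU/CPU
                 # strategy.
      num_gpus=1)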