# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file contains code to create run an experiment."""
import functools
import os
from typing import Text, Optional, Sequence

from absl import logging
import orbit
import tensorflow as tf

from deeplab2 import common
from deeplab2 import config_pb2
from deeplab2.data import dataset
from deeplab2.model import deeplab
from deeplab2.model.loss import loss_builder
from deeplab2.trainer import distribution_utils
from deeplab2.trainer import evaluator as evaluator_lib
from deeplab2.trainer import runner_utils
from deeplab2.trainer import trainer as trainer_lib
from deeplab2.video import motion_deeplab
from deeplab2.video import vip_deeplab
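
# Typical usage (a sketch; in the open-source repo this module is invoked by a
# launcher binary such as trainer/train.py rather than called directly, and
# the model_dir path below is purely illustrative):
#   config = config_pb2.ExperimentOptions()  # usually parsed from a textproto
#   run_experiment('train', config, model_dir='/tmp/exp', tpu=None, num_gpus=1)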

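# Checkpoint item names of the instance prediction heads; these entries are
# optionally excluded when restoring from an initial checkpoint (see
# run_experiment below).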
_INSTANCE_LAYER_NAMES = (common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER,
                         common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER,
                         common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER)
# For Motion-DeepLab, 6 channels (two RGB frames) are used as input during
# inference. Its 7th input channel is obtained from the predicted center
# heatmap of the previous frame.
_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS = 6
# All other networks use 3 channels as input (RGB).
_SINGLE_FRAME_INPUT_CHANNELS = 3


def create_deeplab_model(
    config: config_pb2.ExperimentOptions,
    dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.Model:
  """Creates DeepLab model based on config."""
  meta_architecture = config.model_options.WhichOneof('meta_architecture')
  if meta_architecture == 'motion_deeplab':
    return motion_deeplab.MotionDeepLab(config, dataset_descriptor)
  if meta_architecture == 'vip_deeplab':
    return vip_deeplab.ViPDeepLab(config, dataset_descriptor)
  return deeplab.DeepLab(config, dataset_descriptor)


def build_deeplab_model(deeplab_model: tf.keras.Model,
                        crop_size: Sequence[int],
                        batch_size: Optional[int] = None):
  """Builds DeepLab model with input crop size."""
  if isinstance(deeplab_model,
                (motion_deeplab.MotionDeepLab, vip_deeplab.ViPDeepLab)):
    # Motion-DeepLab and ViP-DeepLab consume the input differently even though
    # the input_shape is the same: Motion-DeepLab stacks two frames into a
    # single input, while ViP-DeepLab splits the two frames and passes each one
    # through the backbone encoder individually.
    input_shape = list(crop_size) + [_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS]
  else:
    input_shape = list(crop_size) + [_SINGLE_FRAME_INPUT_CHANNELS]
  deeplab_model(
      tf.keras.Input(input_shape, batch_size=batch_size), training=False)
  return input_shape
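

# Example wiring of the two helpers above (illustrative only; `config` and
# `dataset_name` would normally come from the experiment options, and the
# crop size is arbitrary):
#   model = create_deeplab_model(
#       config, dataset.MAP_NAME_TO_DATASET_INFO[dataset_name])
#   input_shape = build_deeplab_model(model, crop_size=[513, 513])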


def run_experiment(mode: Text, config: config_pb2.ExperimentOptions,
                   model_dir: Text, tpu: Optional[Text], num_gpus: int):
  """Runs an experiment.

  Args:
    mode: A string specifying the mode of the experiment. Supported modes are
      `train`, `train_and_eval`, `eval`, and `continuous_eval`.
    config: A config_pb2.ExperimentOptions configuration.
    model_dir: A path to store all checkpoints and other experimental artifacts.
    tpu: The name or address of the tpu to connect to, if any.
    num_gpus: An integer specifying the number of GPUs to use. If mode contains
      `eval`, num_gpus must be less than or equal to 1.

  Raises:
    ValueError: If mode is none of `train`, `train_and_eval`, `eval`, or
      `continuous_eval`.
    ValueError: If mode is `train_and_eval`, but different dataset_names are
      specified for training and evaluation. This restriction could be relaxed
      for applications such as domain-transfer learning (e.g., synthetic-to-real
      datasets), which has not been fully tested yet.
    ValueError: If mode includes `eval` and num_gpus > 1. Currently, evaluation
      is not supported on more than a single GPU.
  """
  strategy = distribution_utils.create_strategy(tpu, num_gpus)
  logging.info('Using strategy %s with %d replicas', type(strategy),
               strategy.num_replicas_in_sync)

  if 'eval' in mode:
    dataset_name = config.eval_dataset_options.dataset
    if (mode == 'train_and_eval' and
        dataset_name != config.train_dataset_options.dataset):
      raise ValueError('Using different dataset_names in train_and_eval mode.')
    if num_gpus > 1:
      raise ValueError(
          'Using more than one GPU for evaluation is not supported.')
  else:
    dataset_name = config.train_dataset_options.dataset

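  # Look up the dataset metadata consumed by the loss builder: the number of
  # classes, the ignore label, and which classes carry instance annotations.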
  dataset_info = dataset.MAP_NAME_TO_DATASET_INFO[dataset_name]
  num_classes = dataset_info.num_classes
  ignore_label = dataset_info.ignore_label
  class_has_instances_list = dataset_info.class_has_instances_list

  trainer = None
  evaluator = None
  with strategy.scope():
    deeplab_model = create_deeplab_model(config, dataset_info)
    losses = loss_builder.DeepLabFamilyLoss(config.trainer_options.loss_options,
                                            num_classes, ignore_label,
                                            class_has_instances_list)
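    # Orbit's global step is a tf.Variable shared by the trainer, the
    # evaluator, and the checkpoint assembled below.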
    global_step = orbit.utils.create_global_step()
    if 'train' in mode:
      trainer = trainer_lib.Trainer(config, deeplab_model, losses, global_step)
    if 'eval' in mode:
      evaluator = evaluator_lib.Evaluator(config, deeplab_model, losses,
                                          global_step, model_dir)

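  # Assemble everything that should be checkpointed: the global step, the
  # model's checkpoint items, and (when training) the optimizer state.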
  checkpoint_dict = dict(global_step=global_step)
  checkpoint_dict.update(deeplab_model.checkpoint_items)
  if trainer is not None:
    checkpoint_dict['optimizer'] = trainer.optimizer
    if trainer.backbone_optimizer is not None:
      checkpoint_dict['backbone_optimizer'] = trainer.backbone_optimizer
  checkpoint = tf.train.Checkpoint(**checkpoint_dict)

  # Define items to load from initial checkpoint.
  init_dict = deeplab_model.checkpoint_items
  if (not config.model_options
      .restore_semantic_last_layer_from_initial_checkpoint):
    del init_dict[common.CKPT_SEMANTIC_LAST_LAYER]
  if (not config.model_options
      .restore_instance_last_layer_from_initial_checkpoint):
    for layer_name in _INSTANCE_LAYER_NAMES:
      if layer_name in init_dict:
        del init_dict[layer_name]
  init_fn = functools.partial(runner_utils.maybe_load_checkpoint,
                              config.model_options.initial_checkpoint,
                              init_dict)
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=config.trainer_options.num_checkpoints_to_keep,
      step_counter=global_step,
      checkpoint_interval=config.trainer_options.save_checkpoints_steps,
      init_fn=init_fn)

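  # The Orbit controller drives the outer train/eval loop, including the
  # checkpointing and summary-writing cadence configured above.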
  controller = orbit.Controller(
      strategy=strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=config.trainer_options.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_interval=config.trainer_options.save_summaries_steps,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'eval'))

  with strategy.scope():
    # Save initial checkpoint.
    if 'train' in mode:
      crop_size = list(config.train_dataset_options.crop_size)
      # Build model before saving.
      build_deeplab_model(deeplab_model, crop_size)
      controller.save_checkpoint()
    if mode == 'train':
      controller.train(
          steps=config.trainer_options.solver_options.training_number_of_steps)
    elif mode == 'train_and_eval':
      # Interleave training and evaluation.
      controller.train_and_evaluate(
          train_steps=(
              config.trainer_options.solver_options.training_number_of_steps),
          eval_steps=config.evaluator_options.eval_steps,
          eval_interval=config.evaluator_options.eval_interval)
    elif mode == 'eval':
      controller.evaluate(steps=config.evaluator_options.eval_steps)
    elif mode == 'continuous_eval':
      # Monitor the checkpoint directory for new checkpoints to evaluate.
      timeout = config.evaluator_options.continuous_eval_timeout
      if timeout == -1:
        # A timeout of -1 means waiting for new checkpoints indefinitely.
        timeout = None
      controller.evaluate_continuously(
          steps=config.evaluator_options.eval_steps, timeout=timeout)
    else:
      raise ValueError('Mode %s is not a valid mode.' % mode)