Spaces:

akhaliq
/

deeplab2

Runtime error

deeplab2 / trainer /train_lib.py

akhaliq3

spaces demo

506da10 almost 4 years ago

8.98 kB

	# coding=utf-8
	# Copyright 2021 The Deeplab2 Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""This file contains code to create run an experiment."""
	import functools
	import os
	from typing import Text, Optional, Sequence

	from absl import logging
	import orbit
	import tensorflow as tf

	from deeplab2 import common
	from deeplab2 import config_pb2
	from deeplab2.data import dataset
	from deeplab2.model import deeplab
	from deeplab2.model.loss import loss_builder
	from deeplab2.trainer import distribution_utils
	from deeplab2.trainer import evaluator as evaluator_lib
	from deeplab2.trainer import runner_utils
	from deeplab2.trainer import trainer as trainer_lib
	from deeplab2.video import motion_deeplab
	from deeplab2.video import vip_deeplab

	# Keys of a model's `checkpoint_items` that `run_experiment` leaves out of the
	# initial-checkpoint restore when
	# `restore_instance_last_layer_from_initial_checkpoint` is disabled.
	_INSTANCE_LAYER_NAMES = (
	    common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER,
	    common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER,
	    common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER,
	)

	# Motion-DeepLab takes 6 input channels (two stacked RGB frames) during
	# inference. A 7th channel, the center heatmap predicted for the previous
	# frame, is obtained separately and is not counted here.
	_TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS = 6

	# Every other network takes a single RGB frame, i.e. 3 input channels.
	_SINGLE_FRAME_INPUT_CHANNELS = 3


	def create_deeplab_model(
	    config: config_pb2.ExperimentOptions,
	    dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.Model:
	  """Instantiates the model class selected by the config's meta architecture.

	  Args:
	    config: Experiment options. The `meta_architecture` oneof of its
	      `model_options` decides which model class is used.
	    dataset_descriptor: Dataset descriptor forwarded to the model constructor.

	  Returns:
	    A `MotionDeepLab` when the oneof is `motion_deeplab`, a `ViPDeepLab` when
	    it is `vip_deeplab`, and a plain `DeepLab` for any other value.
	  """
	  meta_architecture = config.model_options.WhichOneof('meta_architecture')
	  if meta_architecture == 'motion_deeplab':
	    model_cls = motion_deeplab.MotionDeepLab
	  elif meta_architecture == 'vip_deeplab':
	    model_cls = vip_deeplab.ViPDeepLab
	  else:
	    # Every remaining meta architecture maps onto the generic DeepLab model.
	    model_cls = deeplab.DeepLab
	  return model_cls(config, dataset_descriptor)


	def build_deeplab_model(deeplab_model: tf.keras.Model,
	                        crop_size: Sequence[int],
	                        batch_size: Optional[int] = None):
	  """Builds the model by calling it once on a symbolic input.

	  Args:
	    deeplab_model: The model to build.
	    crop_size: Spatial size of the input; the channel count is appended to it.
	    batch_size: Optional batch size forwarded to `tf.keras.Input`.

	  Returns:
	    The input shape that was used, as a list: `crop_size` followed by the
	    number of input channels.
	  """
	  # Motion-DeepLab and ViP-DeepLab share the same two-frame input shape even
	  # though they consume it differently: Motion-DeepLab feeds both frames as one
	  # input, whereas ViP-DeepLab first splits the frames and passes them
	  # individually to the backbone encoder.
	  two_frame_models = (motion_deeplab.MotionDeepLab, vip_deeplab.ViPDeepLab)
	  if isinstance(deeplab_model, two_frame_models):
	    num_channels = _TWO_FRAME_MOTION_DEEPLAB_INPUT_CHANNELS
	  else:
	    num_channels = _SINGLE_FRAME_INPUT_CHANNELS

	  input_shape = [*crop_size, num_channels]
	  symbolic_input = tf.keras.Input(input_shape, batch_size=batch_size)
	  deeplab_model(symbolic_input, training=False)
	  return input_shape


	def run_experiment(mode: Text, config: config_pb2.ExperimentOptions,
	                   model_dir: Text, tpu: Optional[Text], num_gpus: int):
	  """Runs an experiment.

	  The mode is validated before anything else happens, so an unsupported mode
	  fails immediately instead of after the model has been created (and, for
	  mode strings that merely contain `train`, after an initial checkpoint has
	  been saved).

	  Args:
	    mode: A string specifying the mode of the experiment. Supported are `train`,
	      `train_and_eval`, `eval` and `continuous_eval`.
	    config: A config_pb2.ExperimentOptions configuration.
	    model_dir: A path to store all checkpoints and other experimental artifacts.
	    tpu: The name or address of the tpu to connect to, if any.
	    num_gpus: An integer specifying the number of GPUs to use. If mode contains
	      `eval`, num_gpus must be less or equal to 1.

	  Raises:
	    ValueError: If mode is none of `train`, `train_and_eval`, `eval`, or
	      `continuous_eval`.
	    ValueError: If mode is `train_and_eval`, but different dataset_names are
	      specified for training and evaluation. This error could be relaxed for
	      applications like domain transferring learning (e.g., synthetic to real
	      datasets), which has not been fully tested yet.
	    ValueError: If mode includes `eval` and num_gpus > 1. Currently, evaluation
	      is not supported on more than a single GPU.
	  """
	  supported_modes = ('train', 'train_and_eval', 'eval', 'continuous_eval')
	  if mode not in supported_modes:
	    raise ValueError('Mode %s is not a valid mode.' % mode)

	  is_training = 'train' in mode
	  is_evaluating = 'eval' in mode

	  strategy = distribution_utils.create_strategy(tpu, num_gpus)
	  logging.info('Using strategy %s with %d replicas', type(strategy),
	               strategy.num_replicas_in_sync)

	  if is_evaluating:
	    dataset_name = config.eval_dataset_options.dataset
	    if (mode == 'train_and_eval' and
	        dataset_name != config.train_dataset_options.dataset):
	      raise ValueError('Using difference dataset_names in train_and_eval mode.')
	    if num_gpus > 1:
	      raise ValueError(
	          'Using more than one GPU for evaluation is not supported.')
	  else:
	    dataset_name = config.train_dataset_options.dataset

	  # Look the dataset descriptor up once and reuse it below.
	  dataset_info = dataset.MAP_NAME_TO_DATASET_INFO[dataset_name]

	  trainer = None
	  evaluator = None
	  with strategy.scope():
	    deeplab_model = create_deeplab_model(config, dataset_info)
	    losses = loss_builder.DeepLabFamilyLoss(
	        config.trainer_options.loss_options,
	        dataset_info.num_classes,
	        dataset_info.ignore_label,
	        dataset_info.class_has_instances_list)
	    global_step = orbit.utils.create_global_step()
	    if is_training:
	      trainer = trainer_lib.Trainer(config, deeplab_model, losses, global_step)
	    if is_evaluating:
	      evaluator = evaluator_lib.Evaluator(config, deeplab_model, losses,
	                                          global_step, model_dir)

	  # Items tracked by the checkpoints written to (and read from) model_dir.
	  checkpoint_dict = dict(global_step=global_step)
	  checkpoint_dict.update(deeplab_model.checkpoint_items)
	  if trainer is not None:
	    checkpoint_dict['optimizer'] = trainer.optimizer
	    if trainer.backbone_optimizer is not None:
	      checkpoint_dict['backbone_optimizer'] = trainer.backbone_optimizer
	  checkpoint = tf.train.Checkpoint(**checkpoint_dict)

	  # Define items to load from initial checkpoint. Work on a copy so that
	  # removing entries below can never alter a mapping owned by the model
	  # (whether `checkpoint_items` returns a fresh dict is not visible here).
	  init_dict = dict(deeplab_model.checkpoint_items)
	  if (not config.model_options
	      .restore_semantic_last_layer_from_initial_checkpoint):
	    del init_dict[common.CKPT_SEMANTIC_LAST_LAYER]
	  if (not config.model_options
	      .restore_instance_last_layer_from_initial_checkpoint):
	    # Not every model exposes every instance head, so missing keys are fine.
	    for layer_name in _INSTANCE_LAYER_NAMES:
	      init_dict.pop(layer_name, None)
	  init_fn = functools.partial(runner_utils.maybe_load_checkpoint,
	                              config.model_options.initial_checkpoint,
	                              init_dict)
	  checkpoint_manager = tf.train.CheckpointManager(
	      checkpoint,
	      directory=model_dir,
	      max_to_keep=config.trainer_options.num_checkpoints_to_keep,
	      step_counter=global_step,
	      checkpoint_interval=config.trainer_options.save_checkpoints_steps,
	      init_fn=init_fn)

	  controller = orbit.Controller(
	      strategy=strategy,
	      trainer=trainer,
	      evaluator=evaluator,
	      global_step=global_step,
	      steps_per_loop=config.trainer_options.steps_per_loop,
	      checkpoint_manager=checkpoint_manager,
	      summary_interval=config.trainer_options.save_summaries_steps,
	      summary_dir=os.path.join(model_dir, 'train'),
	      eval_summary_dir=os.path.join(model_dir, 'eval'))

	  with strategy.scope():
	    # Save initial checkpoint.
	    if is_training:
	      crop_size = list(config.train_dataset_options.crop_size)
	      # Build model before saving.
	      build_deeplab_model(deeplab_model, crop_size)
	      controller.save_checkpoint()

	    if mode == 'train':
	      controller.train(
	          steps=config.trainer_options.solver_options.training_number_of_steps)
	    elif mode == 'train_and_eval':
	      # Interleave training and evaluation.
	      controller.train_and_evaluate(
	          train_steps=(
	              config.trainer_options.solver_options.training_number_of_steps),
	          eval_steps=config.evaluator_options.eval_steps,
	          eval_interval=config.evaluator_options.eval_interval)
	    elif mode == 'eval':
	      controller.evaluate(steps=config.evaluator_options.eval_steps)
	    else:
	      # The mode was validated on entry, so this is `continuous_eval`.
	      # Monitor the checkpoint directory for new checkpoints to evaluate.
	      timeout = config.evaluator_options.continuous_eval_timeout
	      if timeout == -1:
	        # Wait forever
	        timeout = None
	      controller.evaluate_continuously(
	          steps=config.evaluator_options.eval_steps, timeout=timeout)