# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, check out LICENSE.md
import functools
import math
import types

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Upsample as NearestUpsample

from imaginaire.layers import Conv2dBlock, LinearBlock, Res2dBlock
from imaginaire.utils.data import (get_crop_h_w,
                                   get_paired_input_image_channel_number,
                                   get_paired_input_label_channel_number)
from imaginaire.utils.distributed import master_only_print as print


class Generator(nn.Module):
    r"""SPADE generator constructor.

    Args:
        gen_cfg (obj): Generator definition part of the yaml config file.
        data_cfg (obj): Data definition part of the yaml config file.
    """

    def __init__(self, gen_cfg, data_cfg):
        super(Generator, self).__init__()
        print('SPADE generator initialization.')
        # We assume the first datum is the ground truth image.
        image_channels = getattr(gen_cfg, 'image_channels', None)
        if image_channels is None:
            image_channels = get_paired_input_image_channel_number(data_cfg)
        num_labels = getattr(gen_cfg, 'num_labels', None)
        if num_labels is None:
            # Calculate the number of input label channels when not specified.
            num_labels = get_paired_input_label_channel_number(data_cfg)
        crop_h, crop_w = get_crop_h_w(data_cfg.train.augmentations)
        # Build the generator.
        out_image_small_side_size = crop_w if crop_w < crop_h else crop_h
        num_filters = getattr(gen_cfg, 'num_filters', 128)
        kernel_size = getattr(gen_cfg, 'kernel_size', 3)
        weight_norm_type = getattr(gen_cfg, 'weight_norm_type', 'spectral')
        cond_dims = 0
        # Check whether we use the style code.
        style_dims = getattr(gen_cfg, 'style_dims', None)
        self.style_dims = style_dims
        if style_dims is not None:
            print('\tStyle code dimensions: %d' % style_dims)
            cond_dims += style_dims
            self.use_style = True
        else:
            self.use_style = False
        # Check whether we use the attribute code.
        if hasattr(gen_cfg, 'attribute_dims'):
            self.use_attribute = True
            self.attribute_dims = gen_cfg.attribute_dims
            cond_dims += gen_cfg.attribute_dims
        else:
            self.use_attribute = False
        if not self.use_style and not self.use_attribute:
            self.use_style_encoder = False
        else:
            self.use_style_encoder = True
        print('\tBase filter number: %d' % num_filters)
        print('\tConvolution kernel size: %d' % kernel_size)
        print('\tWeight norm type: %s' % weight_norm_type)
        skip_activation_norm = \
            getattr(gen_cfg, 'skip_activation_norm', True)
        activation_norm_params = getattr(gen_cfg, 'activation_norm_params',
                                         None)
        if activation_norm_params is None:
            activation_norm_params = types.SimpleNamespace()
        if not hasattr(activation_norm_params, 'num_filters'):
            setattr(activation_norm_params, 'num_filters', 128)
        if not hasattr(activation_norm_params, 'kernel_size'):
            setattr(activation_norm_params, 'kernel_size', 3)
        if not hasattr(activation_norm_params, 'activation_norm_type'):
            setattr(activation_norm_params, 'activation_norm_type',
                    'sync_batch')
        if not hasattr(activation_norm_params, 'separate_projection'):
            setattr(activation_norm_params, 'separate_projection', False)
        if not hasattr(activation_norm_params, 'activation_norm_params'):
            activation_norm_params.activation_norm_params = \
                types.SimpleNamespace()
            activation_norm_params.activation_norm_params.affine = True
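        # The SPADE layers are conditioned on the label map, so their
        # conditional input dimension equals the number of label channels.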
        setattr(activation_norm_params, 'cond_dims', num_labels)
        if not hasattr(activation_norm_params, 'weight_norm_type'):
            setattr(activation_norm_params, 'weight_norm_type',
                    weight_norm_type)
        global_adaptive_norm_type = getattr(gen_cfg,
                                            'global_adaptive_norm_type',
                                            'sync_batch')
        use_posenc_in_input_layer = getattr(gen_cfg,
                                            'use_posenc_in_input_layer',
                                            True)
        output_multiplier = getattr(gen_cfg, 'output_multiplier', 1.0)
        print(activation_norm_params)
        self.spade_generator = SPADEGenerator(num_labels,
                                              out_image_small_side_size,
                                              image_channels,
                                              num_filters,
                                              kernel_size,
                                              cond_dims,
                                              activation_norm_params,
                                              weight_norm_type,
                                              global_adaptive_norm_type,
                                              skip_activation_norm,
                                              use_posenc_in_input_layer,
                                              self.use_style_encoder,
                                              output_multiplier)
        if self.use_style:
            # Build the style encoder.
            style_enc_cfg = getattr(gen_cfg, 'style_enc', None)
            if style_enc_cfg is None:
                style_enc_cfg = types.SimpleNamespace()
            if not hasattr(style_enc_cfg, 'num_filters'):
                setattr(style_enc_cfg, 'num_filters', 128)
            if not hasattr(style_enc_cfg, 'kernel_size'):
                setattr(style_enc_cfg, 'kernel_size', 3)
            if not hasattr(style_enc_cfg, 'weight_norm_type'):
                setattr(style_enc_cfg, 'weight_norm_type', weight_norm_type)
            setattr(style_enc_cfg, 'input_image_channels', image_channels)
            setattr(style_enc_cfg, 'style_dims', style_dims)
            self.style_encoder = StyleEncoder(style_enc_cfg)
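        # Cache for the random style code; reused across calls when
        # `use_fixed_random_style=True` is passed to `inference`.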
        self.z = None
        print('Done with the SPADE generator initialization.')
    def forward(self, data, random_style=False):
        r"""SPADE Generator forward.

        Args:
            data (dict):
              - images (N x C1 x H x W tensor): Ground truth images.
              - label (N x C2 x H x W tensor): Semantic representations.
              - z (N x style_dims tensor): Gaussian random noise.
            random_style (bool): Whether to sample a random style vector.
        Returns:
            (dict):
              - fake_images (N x 3 x H x W tensor): Fake images.
              - mu (N x style_dims tensor): Mean vectors.
              - logvar (N x style_dims tensor): Log-variance vectors.
        """
        if self.use_style_encoder:
            if random_style:
                bs = data['label'].size(0)
                z = torch.randn(
                    bs, self.style_dims, dtype=torch.float32).cuda()
                if data['label'].dtype == torch.float16:
                    z = z.half()
                mu = None
                logvar = None
            else:
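                # Encode the ground truth image into (mu, logvar) and draw z
                # with the reparameterization trick
                # (see StyleEncoder.forward).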
                mu, logvar, z = self.style_encoder(data['images'])
            if self.use_attribute:
                data['z'] = torch.cat(
                    (z, data['attributes'].squeeze(1)), dim=1)
            else:
                data['z'] = z
        output = self.spade_generator(data)
        if self.use_style_encoder:
            output['mu'] = mu
            output['logvar'] = logvar
        return output
    def inference(self,
                  data,
                  random_style=False,
                  use_fixed_random_style=False,
                  keep_original_size=False):
        r"""Compute the result images for a batch of input data.

        Args:
            data (dict):
              - images (N x C1 x H x W tensor): Ground truth images.
              - label (N x C2 x H x W tensor): Semantic representations.
              - z (N x style_dims tensor): Gaussian random noise.
            random_style (bool): Whether to sample a random style vector.
            use_fixed_random_style (bool): Sample a random style once and
                reuse it for all remaining inference calls.
            keep_original_size (bool): Keep the original size of the input.
        Returns:
            (tuple):
              - output_images (N x 3 x H x W tensor): Fake images.
              - file_names (list): Output file names, taken from the
                segmentation / edge map keys of the input data.
        """
        self.eval()
        self.spade_generator.eval()
        if self.use_style_encoder:
            if random_style:
                if self.z is None or not use_fixed_random_style:
                    bs = data['label'].size(0)
                    z = torch.randn(
                        bs, self.style_dims, dtype=torch.float32).to('cuda')
                    if data['label'].dtype == torch.float16:
                        z = z.half()
                    self.z = z
                else:
                    z = self.z
            else:
                mu, logvar, z = self.style_encoder(data['images'])
            data['z'] = z
        output = self.spade_generator(data)
        output_images = output['fake_images']
        if keep_original_size:
            height = data['original_h_w'][0][0]
            width = data['original_h_w'][0][1]
            output_images = torch.nn.functional.interpolate(
                output_images, size=[height, width])
        file_names = None
        for key in data['key'].keys():
            if 'segmaps' in key or 'seg_maps' in key:
                file_names = data['key'][key][0]
                break
        # Edge map keys, when present, take precedence over seg map keys.
        for key in data['key'].keys():
            if 'edgemaps' in key or 'edge_maps' in key:
                file_names = data['key'][key][0]
                break
        return output_images, file_names
class SPADEGenerator(nn.Module):
    r"""SPADE image generator constructor.

    Args:
        num_labels (int): Number of different labels.
        out_image_small_side_size (int): min(width, height) of the output.
        image_channels (int): Number of channels of the output image.
        num_filters (int): Base filter number.
        kernel_size (int): Convolution kernel size.
        style_dims (int): Dimensions of the style code.
        activation_norm_params (obj): Spatially adaptive normalization
            parameters.
        weight_norm_type (str): Type of weight normalization.
            ``'none'``, ``'spectral'``, or ``'weight'``.
        global_adaptive_norm_type (str): Type of normalization in SPADE.
        skip_activation_norm (bool): If ``True``, applies activation norm to
            the shortcut connection in residual blocks.
        use_posenc_in_input_layer (bool): If ``True``, concatenates a
            positional encoding grid to the input label map.
        use_style_encoder (bool): Whether to use a global adaptive norm,
            such as conditional batch norm or adaptive instance norm,
            driven by the style code.
        output_multiplier (float): A positive number multiplied with the
            output before the final tanh.
    """
    def __init__(self,
                 num_labels,
                 out_image_small_side_size,
                 image_channels,
                 num_filters,
                 kernel_size,
                 style_dims,
                 activation_norm_params,
                 weight_norm_type,
                 global_adaptive_norm_type,
                 skip_activation_norm,
                 use_posenc_in_input_layer,
                 use_style_encoder,
                 output_multiplier):
        super(SPADEGenerator, self).__init__()
        self.output_multiplier = output_multiplier
        self.use_style_encoder = use_style_encoder
        self.use_posenc_in_input_layer = use_posenc_in_input_layer
        self.out_image_small_side_size = out_image_small_side_size
        self.num_filters = num_filters
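        # 'same' padding for an odd convolution kernel size.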
        padding = int(np.ceil((kernel_size - 1.0) / 2))
        nonlinearity = 'leakyrelu'
        activation_norm_type = 'spatially_adaptive'
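        # In imaginaire blocks, the `order` string spells out the layer
        # sequence: C = convolution, N = normalization, A = activation.
        # 'NACNAC' therefore runs norm -> activation -> conv twice per
        # residual branch.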
        base_res2d_block = \
            functools.partial(Res2dBlock,
                              kernel_size=kernel_size,
                              padding=padding,
                              bias=[True, True, False],
                              weight_norm_type=weight_norm_type,
                              activation_norm_type=activation_norm_type,
                              activation_norm_params=activation_norm_params,
                              skip_activation_norm=skip_activation_norm,
                              nonlinearity=nonlinearity,
                              order='NACNAC')
        if self.use_style_encoder:
            self.fc_0 = LinearBlock(style_dims, 2 * style_dims,
                                    weight_norm_type=weight_norm_type,
                                    nonlinearity='relu',
                                    order='CAN')
            self.fc_1 = LinearBlock(2 * style_dims, 2 * style_dims,
                                    weight_norm_type=weight_norm_type,
                                    nonlinearity='relu',
                                    order='CAN')
            adaptive_norm_params = types.SimpleNamespace()
            if not hasattr(adaptive_norm_params, 'cond_dims'):
                setattr(adaptive_norm_params, 'cond_dims', 2 * style_dims)
            if not hasattr(adaptive_norm_params, 'activation_norm_type'):
                setattr(adaptive_norm_params, 'activation_norm_type',
                        global_adaptive_norm_type)
            if not hasattr(adaptive_norm_params, 'weight_norm_type'):
                setattr(adaptive_norm_params, 'weight_norm_type',
                        activation_norm_params.weight_norm_type)
            if not hasattr(adaptive_norm_params, 'separate_projection'):
                setattr(adaptive_norm_params, 'separate_projection',
                        activation_norm_params.separate_projection)
            adaptive_norm_params.activation_norm_params = \
                types.SimpleNamespace()
            setattr(adaptive_norm_params.activation_norm_params, 'affine',
                    activation_norm_params.activation_norm_params.affine)
            base_cbn2d_block = \
                functools.partial(Conv2dBlock,
                                  kernel_size=kernel_size,
                                  stride=1,
                                  padding=padding,
                                  bias=True,
                                  weight_norm_type=weight_norm_type,
                                  activation_norm_type='adaptive',
                                  activation_norm_params=adaptive_norm_params,
                                  nonlinearity=nonlinearity,
                                  order='NAC')
        else:
            base_conv2d_block = \
                functools.partial(Conv2dBlock,
                                  kernel_size=kernel_size,
                                  stride=1,
                                  padding=padding,
                                  bias=True,
                                  weight_norm_type=weight_norm_type,
                                  nonlinearity=nonlinearity,
                                  order='NAC')
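        # Two extra input channels carry the (x, y) positional encoding
        # grid that is concatenated to the downsampled label map.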
        in_num_labels = num_labels
        in_num_labels += 2 if self.use_posenc_in_input_layer else 0
        self.head_0 = Conv2dBlock(in_num_labels, 8 * num_filters,
                                  kernel_size=kernel_size, stride=1,
                                  padding=padding,
                                  weight_norm_type=weight_norm_type,
                                  activation_norm_type='none',
                                  nonlinearity=nonlinearity)
        if self.use_style_encoder:
            self.cbn_head_0 = base_cbn2d_block(
                8 * num_filters, 16 * num_filters)
        else:
            self.conv_head_0 = base_conv2d_block(
                8 * num_filters, 16 * num_filters)
        self.head_1 = base_res2d_block(16 * num_filters, 16 * num_filters)
        self.head_2 = base_res2d_block(16 * num_filters, 16 * num_filters)
        self.up_0a = base_res2d_block(16 * num_filters, 8 * num_filters)
        if self.use_style_encoder:
            self.cbn_up_0a = base_cbn2d_block(
                8 * num_filters, 8 * num_filters)
        else:
            self.conv_up_0a = base_conv2d_block(
                8 * num_filters, 8 * num_filters)
        self.up_0b = base_res2d_block(8 * num_filters, 8 * num_filters)
        self.up_1a = base_res2d_block(8 * num_filters, 4 * num_filters)
        if self.use_style_encoder:
            self.cbn_up_1a = base_cbn2d_block(
                4 * num_filters, 4 * num_filters)
        else:
            self.conv_up_1a = base_conv2d_block(
                4 * num_filters, 4 * num_filters)
        self.up_1b = base_res2d_block(4 * num_filters, 4 * num_filters)
        self.up_2a = base_res2d_block(4 * num_filters, 4 * num_filters)
        if self.use_style_encoder:
            self.cbn_up_2a = base_cbn2d_block(
                4 * num_filters, 4 * num_filters)
        else:
            self.conv_up_2a = base_conv2d_block(
                4 * num_filters, 4 * num_filters)
        self.up_2b = base_res2d_block(4 * num_filters, 2 * num_filters)
        self.conv_img256 = Conv2dBlock(2 * num_filters, image_channels,
                                       5, stride=1, padding=2,
                                       weight_norm_type=weight_norm_type,
                                       activation_norm_type='none',
                                       nonlinearity=nonlinearity,
                                       order='ANC')
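        # `base` is the total downsampling factor between the output
        # resolution and the head input, chosen so that the smaller side of
        # the head input is 16; it is raised below for 512 and 1024 outputs.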
        self.base = 16
        if self.out_image_small_side_size == 512:
            self.up_3a = base_res2d_block(2 * num_filters, 1 * num_filters)
            self.up_3b = base_res2d_block(1 * num_filters, 1 * num_filters)
            self.conv_img512 = Conv2dBlock(1 * num_filters, image_channels,
                                           5, stride=1, padding=2,
                                           weight_norm_type=weight_norm_type,
                                           activation_norm_type='none',
                                           nonlinearity=nonlinearity,
                                           order='ANC')
            self.base = 32
        if self.out_image_small_side_size == 1024:
            self.up_3a = base_res2d_block(2 * num_filters, 1 * num_filters)
            self.up_3b = base_res2d_block(1 * num_filters, 1 * num_filters)
            self.conv_img512 = Conv2dBlock(1 * num_filters, image_channels,
                                           5, stride=1, padding=2,
                                           weight_norm_type=weight_norm_type,
                                           activation_norm_type='none',
                                           nonlinearity=nonlinearity,
                                           order='ANC')
            self.up_4a = base_res2d_block(num_filters, num_filters // 2)
            self.up_4b = base_res2d_block(num_filters // 2, num_filters // 2)
            self.conv_img1024 = Conv2dBlock(num_filters // 2, image_channels,
                                            5, stride=1, padding=2,
                                            weight_norm_type=weight_norm_type,
                                            activation_norm_type='none',
                                            nonlinearity=nonlinearity,
                                            order='ANC')
            self.nearest_upsample4x = NearestUpsample(scale_factor=4,
                                                      mode='nearest')
            self.base = 64
        if self.out_image_small_side_size not in (256, 512, 1024):
            raise ValueError('Generation image size (%d, %d) not supported' %
                             (self.out_image_small_side_size,
                              self.out_image_small_side_size))
        self.nearest_upsample2x = NearestUpsample(scale_factor=2,
                                                  mode='nearest')
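        # Fixed 16x16 grid of normalized (x, y) coordinates in [-1, 1], used
        # as a positional encoding; it is bicubically resized to the head
        # input size in `forward`. Moving it to the GPU here assumes CUDA is
        # available, as the rest of this module does.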
        xv, yv = torch.meshgrid(
            [torch.arange(-1, 1.1, 2. / 15), torch.arange(-1, 1.1, 2. / 15)])
        self.xy = torch.cat((xv.unsqueeze(0), yv.unsqueeze(0)), 0).unsqueeze(0)
        self.xy = self.xy.cuda()
    def forward(self, data):
        r"""SPADE Generator forward.

        Args:
            data (dict):
              - images (N x C1 x H x W tensor): Ground truth images.
              - label (N x C2 x H x W tensor): Semantic representations.
              - z (N x style_dims tensor): Gaussian random noise.
        Returns:
            output (dict):
              - fake_images (N x 3 x H x W tensor): Fake images.
        """
        seg = data['label']
        if self.use_style_encoder:
            z = data['z']
            z = self.fc_0(z)
            z = self.fc_1(z)
        # Resize the label map so the smaller side of the head input is
        # always 16.
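        # Example: a 256x256 label map with base=16 becomes 16x16 here;
        # a 1024x1024 map with base=64 also becomes 16x16.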
        sy = math.floor(seg.size()[2] * 1.0 / self.base)
        sx = math.floor(seg.size()[3] * 1.0 / self.base)
        in_seg = F.interpolate(seg, size=[sy, sx], mode='nearest')
        if self.use_posenc_in_input_layer:
            in_xy = F.interpolate(self.xy, size=[sy, sx], mode='bicubic')
            in_seg_xy = torch.cat(
                (in_seg, in_xy.expand(in_seg.size()[0], 2, sy, sx)), 1)
        else:
            in_seg_xy = in_seg
        # 16x16
        x = self.head_0(in_seg_xy)
        if self.use_style_encoder:
            x = self.cbn_head_0(x, z)
        else:
            x = self.conv_head_0(x)
        x = self.head_1(x, seg)
        x = self.head_2(x, seg)
        x = self.nearest_upsample2x(x)
        # 32x32
        x = self.up_0a(x, seg)
        if self.use_style_encoder:
            x = self.cbn_up_0a(x, z)
        else:
            x = self.conv_up_0a(x)
        x = self.up_0b(x, seg)
        x = self.nearest_upsample2x(x)
        # 64x64
        x = self.up_1a(x, seg)
        if self.use_style_encoder:
            x = self.cbn_up_1a(x, z)
        else:
            x = self.conv_up_1a(x)
        x = self.up_1b(x, seg)
        x = self.nearest_upsample2x(x)
        # 128x128
        x = self.up_2a(x, seg)
        if self.use_style_encoder:
            x = self.cbn_up_2a(x, z)
        else:
            x = self.conv_up_2a(x)
        x = self.up_2b(x, seg)
        x = self.nearest_upsample2x(x)
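        # Multi-resolution RGB heads: for 512/1024 outputs, coarser RGB
        # predictions are upsampled and summed with finer ones before tanh.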
        # 256x256
        if self.out_image_small_side_size == 256:
            x256 = self.conv_img256(x)
            x = torch.tanh(self.output_multiplier * x256)
        # 512x512
        elif self.out_image_small_side_size == 512:
            x256 = self.conv_img256(x)
            x256 = self.nearest_upsample2x(x256)
            x = self.up_3a(x, seg)
            x = self.up_3b(x, seg)
            x = self.nearest_upsample2x(x)
            x512 = self.conv_img512(x)
            x = torch.tanh(self.output_multiplier * (x256 + x512))
        # 1024x1024
        elif self.out_image_small_side_size == 1024:
            x256 = self.conv_img256(x)
            x256 = self.nearest_upsample4x(x256)
            x = self.up_3a(x, seg)
            x = self.up_3b(x, seg)
            x = self.nearest_upsample2x(x)
            x512 = self.conv_img512(x)
            x512 = self.nearest_upsample2x(x512)
            x = self.up_4a(x, seg)
            x = self.up_4b(x, seg)
            x = self.nearest_upsample2x(x)
            x1024 = self.conv_img1024(x)
            x = torch.tanh(self.output_multiplier * (x256 + x512 + x1024))
        output = dict()
        output['fake_images'] = x
        return output
class StyleEncoder(nn.Module):
    r"""Style encoder constructor.

    Args:
        style_enc_cfg (obj): Style encoder definition part of the yaml
            config file.
    """

    def __init__(self, style_enc_cfg):
        super(StyleEncoder, self).__init__()
        input_image_channels = style_enc_cfg.input_image_channels
        num_filters = style_enc_cfg.num_filters
        kernel_size = style_enc_cfg.kernel_size
        padding = int(np.ceil((kernel_size - 1.0) / 2))
        style_dims = style_enc_cfg.style_dims
        weight_norm_type = style_enc_cfg.weight_norm_type
        activation_norm_type = 'none'
        nonlinearity = 'leakyrelu'
        base_conv2d_block = \
            functools.partial(Conv2dBlock,
                              kernel_size=kernel_size,
                              stride=2,
                              padding=padding,
                              weight_norm_type=weight_norm_type,
                              activation_norm_type=activation_norm_type,
                              # inplace_nonlinearity=True,
                              nonlinearity=nonlinearity)
        self.layer1 = base_conv2d_block(input_image_channels, num_filters)
        self.layer2 = base_conv2d_block(num_filters * 1, num_filters * 2)
        self.layer3 = base_conv2d_block(num_filters * 2, num_filters * 4)
        self.layer4 = base_conv2d_block(num_filters * 4, num_filters * 8)
        self.layer5 = base_conv2d_block(num_filters * 8, num_filters * 8)
        self.layer6 = base_conv2d_block(num_filters * 8, num_filters * 8)
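        # Six stride-2 convolutions reduce the 256x256 input to a
        # (num_filters * 8) x 4 x 4 feature map, which is flattened into
        # num_filters * 8 * 4 * 4 features for the linear heads below.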
        self.fc_mu = LinearBlock(num_filters * 8 * 4 * 4, style_dims)
        self.fc_var = LinearBlock(num_filters * 8 * 4 * 4, style_dims)
    def forward(self, input_x):
        r"""SPADE style encoder forward.

        Args:
            input_x (N x 3 x H x W tensor): Input images.
        Returns:
            (tuple):
              - mu (N x C tensor): Mean vectors.
              - logvar (N x C tensor): Log-variance vectors.
              - z (N x C tensor): Style code vectors.
        """
        if input_x.size(2) != 256 or input_x.size(3) != 256:
            input_x = F.interpolate(input_x, size=(256, 256), mode='bilinear')
        x = self.layer1(input_x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = x.view(x.size(0), -1)
        mu = self.fc_mu(x)
        logvar = self.fc_var(x)
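        # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I),
        # keeping the sampling step differentiable w.r.t. mu and logvar.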
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        z = eps.mul(std) + mu
        return mu, logvar, z