# resefa/models/ghfeat_encoder.py
# python3.7
"""Contains the implementation of encoder used in GH-Feat (including IDInvert).
ResNet is used as the backbone.
GH-Feat paper: https://arxiv.org/pdf/2007.10379.pdf
IDInvert paper: https://arxiv.org/pdf/2004.00049.pdf

NOTE: Please use `latent_dim` and `num_latents_per_head` to control the
inversion space, such as the Y-space used in GH-Feat and the W-space used in
IDInvert. In addition, IDInvert sets both `use_fpn` and `use_sam` to `False`.
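
Example (illustrative configurations; the exact `latent_dim` and head split
depend on the generator to invert):

    # GH-Feat style: hierarchical Y-space produced by three task heads.
    encoder = GHFeatEncoder(resolution=256,
                            latent_dim=[512] * 14,
                            num_latents_per_head=[4, 4, 6])

    # IDInvert style: a single task head, with FPN and SAM disabled.
    encoder = GHFeatEncoder(resolution=256,
                            latent_dim=[512] * 14,
                            num_latents_per_head=[14],
                            use_fpn=False,
                            use_sam=False)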
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist

__all__ = ['GHFeatEncoder']

# Resolutions allowed.
_RESOLUTIONS_ALLOWED = [8, 16, 32, 64, 128, 256, 512, 1024]

# pylint: disable=missing-function-docstring

class BasicBlock(nn.Module):
"""Implementation of ResNet BasicBlock."""
expansion = 1
def __init__(self,
inplanes,
planes,
base_width=64,
stride=1,
groups=1,
dilation=1,
norm_layer=None,
downsample=None):
super().__init__()
if base_width != 64:
raise ValueError(f'BasicBlock of ResNet only supports '
f'`base_width=64`, but {base_width} received!')
if stride not in [1, 2]:
raise ValueError(f'BasicBlock of ResNet only supports `stride=1` '
f'and `stride=2`, but {stride} received!')
if groups != 1:
raise ValueError(f'BasicBlock of ResNet only supports `groups=1`, '
f'but {groups} received!')
if dilation != 1:
raise ValueError(f'BasicBlock of ResNet only supports '
f'`dilation=1`, but {dilation} received!')
assert self.expansion == 1
self.stride = stride
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self.conv1 = nn.Conv2d(in_channels=inplanes,
out_channels=planes,
kernel_size=3,
stride=stride,
padding=1,
groups=1,
dilation=1,
bias=False)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(in_channels=planes,
out_channels=planes,
kernel_size=3,
stride=1,
padding=1,
groups=1,
dilation=1,
bias=False)
self.bn2 = norm_layer(planes)
self.downsample = downsample
def forward(self, x):
identity = self.downsample(x) if self.downsample is not None else x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out + identity)
return out


class Bottleneck(nn.Module):
"""Implementation of ResNet Bottleneck."""
expansion = 4
def __init__(self,
inplanes,
planes,
base_width=64,
stride=1,
groups=1,
dilation=1,
norm_layer=None,
downsample=None):
super().__init__()
if stride not in [1, 2]:
raise ValueError(f'Bottleneck of ResNet only supports `stride=1` '
f'and `stride=2`, but {stride} received!')
width = int(planes * (base_width / 64)) * groups
self.stride = stride
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self.conv1 = nn.Conv2d(in_channels=inplanes,
out_channels=width,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=False)
self.bn1 = norm_layer(width)
self.conv2 = nn.Conv2d(in_channels=width,
out_channels=width,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
dilation=dilation,
bias=False)
self.bn2 = norm_layer(width)
self.conv3 = nn.Conv2d(in_channels=width,
out_channels=planes * self.expansion,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=False)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
def forward(self, x):
identity = self.downsample(x) if self.downsample is not None else x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
out = self.relu(out + identity)
return out


class GHFeatEncoder(nn.Module):
"""Define the ResNet-based encoder network for GAN inversion.
On top of the backbone, there are several task-heads to produce inverted
codes. Please use `latent_dim` and `num_latents_per_head` to define the
structure. For example, `latent_dim = [512] * 14` and
`num_latents_per_head = [4, 4, 6]` can be used for StyleGAN inversion with
14-layer latent codes, where 3 task heads (corresponding to 4, 4, 6 layers,
respectively) are used.
Settings for the encoder network:
    (1) resolution: The resolution of the input image.
    (2) latent_dim: Dimension of the latent space. Either an integer (a
        single code will be produced) or a list of integers specifying the
        dimension of each layer-wise latent code.
    (3) num_latents_per_head: Number of latent codes produced by each head.
    (4) image_channels: Number of channels of the input image. (default: 3)
(5) final_res: Final resolution of the convolutional layers. (default: 4)
ResNet-related settings:
(1) network_depth: Depth of the network, like 18 for ResNet18. (default: 18)
(2) inplanes: Number of channels of the first convolutional layer.
(default: 64)
(3) groups: Groups of the convolution, used in ResNet. (default: 1)
(4) width_per_group: Number of channels per group, used in ResNet.
(default: 64)
(5) replace_stride_with_dilation: Whether to replace stride with dilation,
used in ResNet. (default: None)
    (6) norm_layer: Normalization layer used in the encoder. If set to
        `None`, `nn.BatchNorm2d` will be used. Also, please NOTE that when
        using batch normalization, the batch size is required to be larger
        than one for training. (default: nn.BatchNorm2d)
(7) max_channels: Maximum number of channels in each layer. (default: 512)
Task-head related settings:
(1) use_fpn: Whether to use Feature Pyramid Network (FPN) before outputting
the latent code. (default: True)
(2) fpn_channels: Number of channels used in FPN. (default: 512)
(3) use_sam: Whether to use Spatial Alignment Module (SAM) before outputting
the latent code. (default: True)
(4) sam_channels: Number of channels used in SAM. (default: 512)
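
    Example (an illustrative sketch for inverting a 256x256 StyleGAN with
    14 layer-wise 512-d codes):

        encoder = GHFeatEncoder(resolution=256,
                                latent_dim=[512] * 14,
                                num_latents_per_head=[4, 4, 6])
        image = torch.randn(2, 3, 256, 256)
        codes = encoder(image)  # Shape: (2, 14, 512).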
"""
arch_settings = {
18: (BasicBlock, [2, 2, 2, 2]),
34: (BasicBlock, [3, 4, 6, 3]),
50: (Bottleneck, [3, 4, 6, 3]),
101: (Bottleneck, [3, 4, 23, 3]),
152: (Bottleneck, [3, 8, 36, 3])
}
def __init__(self,
resolution,
latent_dim,
num_latents_per_head,
image_channels=3,
final_res=4,
network_depth=18,
inplanes=64,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
norm_layer=nn.BatchNorm2d,
max_channels=512,
use_fpn=True,
fpn_channels=512,
use_sam=True,
sam_channels=512):
super().__init__()
if resolution not in _RESOLUTIONS_ALLOWED:
raise ValueError(f'Invalid resolution: `{resolution}`!\n'
f'Resolutions allowed: {_RESOLUTIONS_ALLOWED}.')
if network_depth not in self.arch_settings:
raise ValueError(f'Invalid network depth: `{network_depth}`!\n'
f'Options allowed: '
f'{list(self.arch_settings.keys())}.')
if isinstance(latent_dim, int):
latent_dim = [latent_dim]
assert isinstance(latent_dim, (list, tuple))
assert isinstance(num_latents_per_head, (list, tuple))
assert sum(num_latents_per_head) == len(latent_dim)
self.resolution = resolution
self.latent_dim = latent_dim
self.num_latents_per_head = num_latents_per_head
self.num_heads = len(self.num_latents_per_head)
self.image_channels = image_channels
self.final_res = final_res
self.inplanes = inplanes
self.network_depth = network_depth
self.groups = groups
self.dilation = 1
self.base_width = width_per_group
self.replace_stride_with_dilation = replace_stride_with_dilation
if norm_layer is None:
norm_layer = nn.BatchNorm2d
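        # Switch to synchronized batch normalization under distributed
        # training.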
if norm_layer == nn.BatchNorm2d and dist.is_initialized():
norm_layer = nn.SyncBatchNorm
self.norm_layer = norm_layer
self.max_channels = max_channels
self.use_fpn = use_fpn
self.fpn_channels = fpn_channels
self.use_sam = use_sam
self.sam_channels = sam_channels
block_fn, num_blocks_per_stage = self.arch_settings[network_depth]
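        # The stem downsamples by 4x, the first stage keeps the spatial size,
        # and each following stage halves it, so `num_stages` stages reduce
        # the feature maps to `final_res`.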
self.num_stages = int(np.log2(resolution // final_res)) - 1
        # Append one block for each stage beyond the four predefined ones.
for i in range(len(num_blocks_per_stage), self.num_stages):
num_blocks_per_stage.append(1)
if replace_stride_with_dilation is None:
replace_stride_with_dilation = [False] * self.num_stages
# Backbone.
self.conv1 = nn.Conv2d(in_channels=self.image_channels,
out_channels=self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.stage_channels = [self.inplanes]
self.stages = nn.ModuleList()
for i in range(self.num_stages):
inplanes = self.inplanes if i == 0 else planes * block_fn.expansion
planes = min(self.max_channels, self.inplanes * (2 ** i))
num_blocks = num_blocks_per_stage[i]
stride = 1 if i == 0 else 2
dilate = replace_stride_with_dilation[i]
self.stages.append(self._make_stage(block_fn=block_fn,
inplanes=inplanes,
planes=planes,
num_blocks=num_blocks,
stride=stride,
dilate=dilate))
self.stage_channels.append(planes * block_fn.expansion)
        if self.num_heads > len(self.stage_channels):
            raise ValueError('Number of task heads is larger than the number '
                             'of feature levels! Please reduce the number of '
                             'heads.')
# Task-head.
if self.num_heads == 1:
self.use_fpn = False
self.use_sam = False
if self.use_fpn:
fpn_pyramid_channels = self.stage_channels[-self.num_heads:]
self.fpn = FPN(pyramid_channels=fpn_pyramid_channels,
out_channels=self.fpn_channels)
if self.use_sam:
if self.use_fpn:
sam_pyramid_channels = [self.fpn_channels] * self.num_heads
else:
sam_pyramid_channels = self.stage_channels[-self.num_heads:]
self.sam = SAM(pyramid_channels=sam_pyramid_channels,
out_channels=self.sam_channels)
self.heads = nn.ModuleList()
for head_idx in range(self.num_heads):
# Parse in_channels.
if self.use_sam:
in_channels = self.sam_channels
elif self.use_fpn:
in_channels = self.fpn_channels
else:
in_channels = self.stage_channels[head_idx - self.num_heads]
in_channels = in_channels * final_res * final_res
# Parse out_channels.
start_latent_idx = sum(self.num_latents_per_head[:head_idx])
end_latent_idx = sum(self.num_latents_per_head[:head_idx + 1])
out_channels = sum(self.latent_dim[start_latent_idx:end_latent_idx])
self.heads.append(CodeHead(in_channels=in_channels,
out_channels=out_channels,
norm_layer=self.norm_layer))
def _make_stage(self,
block_fn,
inplanes,
planes,
num_blocks,
stride,
dilate):
norm_layer = self.norm_layer
downsample = None
previous_dilation = self.dilation
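        # When requested, replace stride with dilation: the spatial size is
        # kept and the receptive field grows via dilated convolutions.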
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or inplanes != planes * block_fn.expansion:
downsample = nn.Sequential(
nn.Conv2d(in_channels=inplanes,
out_channels=planes * block_fn.expansion,
kernel_size=1,
stride=stride,
padding=0,
dilation=1,
groups=1,
bias=False),
norm_layer(planes * block_fn.expansion),
)
blocks = []
blocks.append(block_fn(inplanes=inplanes,
planes=planes,
base_width=self.base_width,
stride=stride,
groups=self.groups,
dilation=previous_dilation,
norm_layer=norm_layer,
downsample=downsample))
for _ in range(1, num_blocks):
blocks.append(block_fn(inplanes=planes * block_fn.expansion,
planes=planes,
base_width=self.base_width,
stride=1,
groups=self.groups,
dilation=self.dilation,
norm_layer=norm_layer,
downsample=None))
return nn.Sequential(*blocks)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        # Collect multi-scale features from the stem and each stage.
        features = [x]
        for i in range(self.num_stages):
            x = self.stages[i](x)
            features.append(x)
        # Only the deepest `num_heads` feature levels feed the task heads.
        features = features[-self.num_heads:]
        if self.use_fpn:
            features = self.fpn(features)
        if self.use_sam:
            features = self.sam(features)
        else:
            # Without SAM, align spatial sizes by pooling every level to the
            # resolution of the deepest feature map.
            final_size = features[-1].shape[2:]
            for i in range(self.num_heads - 1):
                features[i] = F.adaptive_avg_pool2d(features[i], final_size)
        # Each head outputs a concatenation of latent codes, which is split
        # back into per-layer codes.
        outputs = []
        for head_idx in range(self.num_heads):
            codes = self.heads[head_idx](features[head_idx])
            start_latent_idx = sum(self.num_latents_per_head[:head_idx])
            end_latent_idx = sum(self.num_latents_per_head[:head_idx + 1])
            split_size = self.latent_dim[start_latent_idx:end_latent_idx]
            outputs.extend(torch.split(codes, split_size, dim=1))
        # Zero-pad all codes to the maximum dimension so that they stack
        # into a tensor of shape (batch, num_layers, max_dim).
        max_dim = max(self.latent_dim)
        for i, dim in enumerate(self.latent_dim):
            if dim < max_dim:
                outputs[i] = F.pad(outputs[i], (0, max_dim - dim))
            outputs[i] = outputs[i].unsqueeze(1)
        return torch.cat(outputs, dim=1)


class FPN(nn.Module):
"""Implementation of Feature Pyramid Network (FPN).

    The input to this module is a pyramid of features with decreasing
    resolutions. This module fuses the multi-level features from the top
    level (lowest resolution) to the bottom level (highest resolution). In
    particular, starting from the top level, each feature is convolved,
    upsampled, and fused into the feature one level below (which is also
    convolved).

Args:
pyramid_channels: A list of integers, each of which indicates the number
of channels of the feature from a particular level.
out_channels: Number of channels for each output.
Returns:
A list of feature maps, each of which has `out_channels` channels.
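
    Example (illustrative shapes):

        fpn = FPN(pyramid_channels=[256, 512, 512], out_channels=512)
        levels = [torch.randn(1, 256, 16, 16),
                  torch.randn(1, 512, 8, 8),
                  torch.randn(1, 512, 4, 4)]
        outputs = fpn(levels)  # Three 512-channel maps at sizes 16, 8, 4.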
"""
def __init__(self, pyramid_channels, out_channels):
super().__init__()
assert isinstance(pyramid_channels, (list, tuple))
self.num_levels = len(pyramid_channels)
self.lateral_layers = nn.ModuleList()
self.feature_layers = nn.ModuleList()
for i in range(self.num_levels):
in_channels = pyramid_channels[i]
self.lateral_layers.append(nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
bias=True))
self.feature_layers.append(nn.Conv2d(in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
bias=True))
def forward(self, inputs):
if len(inputs) != self.num_levels:
raise ValueError('Number of inputs and `num_levels` mismatch!')
# Project all related features to `out_channels`.
laterals = []
for i in range(self.num_levels):
laterals.append(self.lateral_layers[i](inputs[i]))
# Fusion, starting from `top_level`.
for i in range(self.num_levels - 1, 0, -1):
scale_factor = laterals[i - 1].shape[2] // laterals[i].shape[2]
laterals[i - 1] = (laterals[i - 1] +
F.interpolate(laterals[i],
mode='nearest',
scale_factor=scale_factor))
# Get outputs.
outputs = []
for i, lateral in enumerate(laterals):
outputs.append(self.feature_layers[i](lateral))
return outputs


class SAM(nn.Module):
"""Implementation of Spatial Alignment Module (SAM).

    The input to this module is a pyramid of features with decreasing
    resolutions. This module downsamples all levels of features to the
    minimum resolution and fuses each of them with the feature map from the
    deepest level.

Args:
pyramid_channels: A list of integers, each of which indicates the number
of channels of the feature from a particular level.
out_channels: Number of channels for each output.
Returns:
A list of feature maps, each of which has `out_channels` channels.
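
    Example (illustrative shapes):

        sam = SAM(pyramid_channels=[512, 512, 512], out_channels=512)
        levels = [torch.randn(1, 512, 16, 16),
                  torch.randn(1, 512, 8, 8),
                  torch.randn(1, 512, 4, 4)]
        outputs = sam(levels)  # Three maps, each of shape (1, 512, 4, 4).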
"""
def __init__(self, pyramid_channels, out_channels):
super().__init__()
assert isinstance(pyramid_channels, (list, tuple))
self.num_levels = len(pyramid_channels)
self.fusion_layers = nn.ModuleList()
for i in range(self.num_levels):
in_channels = pyramid_channels[i]
self.fusion_layers.append(nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
bias=True))
def forward(self, inputs):
if len(inputs) != self.num_levels:
raise ValueError('Number of inputs and `num_levels` mismatch!')
output_res = inputs[-1].shape[2:]
for i in range(self.num_levels - 1, -1, -1):
if i != self.num_levels - 1:
inputs[i] = F.adaptive_avg_pool2d(inputs[i], output_res)
inputs[i] = self.fusion_layers[i](inputs[i])
if i != self.num_levels - 1:
inputs[i] = inputs[i] + inputs[-1]
return inputs


class CodeHead(nn.Module):
"""Implementation of the task-head to produce inverted codes."""
def __init__(self, in_channels, out_channels, norm_layer):
super().__init__()
self.fc = nn.Linear(in_channels, out_channels, bias=True)
if norm_layer is None:
self.norm = nn.Identity()
else:
self.norm = norm_layer(out_channels)
def forward(self, x):
if x.ndim > 2:
x = x.flatten(start_dim=1)
latent = self.fc(x)
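        # Reshape to (N, C, 1, 1) so that a 2D normalization layer can
        # normalize the flat latent code per channel.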
latent = latent.unsqueeze(2).unsqueeze(3)
latent = self.norm(latent)
return latent.flatten(start_dim=1)
# pylint: enable=missing-function-docstring
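

if __name__ == '__main__':
    # Minimal smoke test with an illustrative configuration (StyleGAN-like,
    # with 14 layer-wise 512-d codes split across three task heads).
    test_encoder = GHFeatEncoder(resolution=256,
                                 latent_dim=[512] * 14,
                                 num_latents_per_head=[4, 4, 6])
    test_encoder.eval()
    with torch.no_grad():
        test_codes = test_encoder(torch.randn(2, 3, 256, 256))
    print(test_codes.shape)  # Expected: torch.Size([2, 14, 512]).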