File size: 10,019 Bytes

a664a45
 
 
1e50af8
a664a45

import torch
import torch.nn as nn

from layers import DownsamplingBlock, UpsamplingBlock

class UnetEncoder(nn.Module):
    """Create the Unet Encoder Network.
    
    C64-C128-C256-C512-C512-C512-C512-C512
    """
    def __init__(self, c_in=3, c_out=512):
        """
        Constructs the Unet Encoder Network.

        Ck denote a Convolution-BatchNorm-ReLU layer with k filters.
            C64-C128-C256-C512-C512-C512-C512-C512
        Args:
            c_in (int, optional): Number of input channels.
            c_out (int, optional): Number of output channels. Default is 512.
        """
        super(UnetEncoder, self).__init__()
        self.enc1 = DownsamplingBlock(c_in, 64, use_norm=False) # C64
        self.enc2 = DownsamplingBlock(64, 128) # C128
        self.enc3 = DownsamplingBlock(128, 256) # C256
        self.enc4 = DownsamplingBlock(256, 512) # C512
        self.enc5 = DownsamplingBlock(512, 512) # C512
        self.enc6 = DownsamplingBlock(512, 512) # C512
        self.enc7 = DownsamplingBlock(512, 512) # C512
        self.enc8 = DownsamplingBlock(512, c_out) # C512

    def forward(self, x):
        x1 = self.enc1(x)
        x2 = self.enc2(x1)
        x3 = self.enc3(x2)
        x4 = self.enc4(x3)
        x5 = self.enc5(x4)
        x6 = self.enc6(x5)
        x7 = self.enc7(x6)
        x8 = self.enc8(x7)
        out = [x8, x7, x6, x5, x4, x3, x2, x1] # latest activation is the first element
        return out
    

class UnetDecoder(nn.Module):
    """Creates the Unet Decoder Network.
    """
    def __init__(self, c_in=512, c_out=64, use_upsampling=False, mode='nearest'):
        """
        Constructs the Unet Decoder Network.

        Ck denote a Convolution-BatchNorm-ReLU layer with k filters.
        
        CDk denotes a Convolution-BatchNorm-Dropout-ReLU layer with a dropout rate of 50%.
            CD512-CD1024-CD1024-C1024-C1024-C512-C256-C128
        Args:
            c_in (int): Number of input channels.
            c_out (int, optional): Number of output channels. Default is 512.
            use_upsampling (bool, optional): Upsampling method for decoder. 
                If True, use upsampling layer followed regular convolution layer.
                If False, use transpose convolution. Default is False
            mode (str, optional): the upsampling algorithm: one of 'nearest', 
                'bilinear', 'bicubic'. Default: 'nearest'
        """
        super(UnetDecoder, self).__init__()
        self.dec1 = UpsamplingBlock(c_in, 512, use_dropout=True, use_upsampling=use_upsampling, mode=mode) # CD512
        self.dec2 = UpsamplingBlock(1024, 512, use_dropout=True, use_upsampling=use_upsampling, mode=mode) # CD1024
        self.dec3 = UpsamplingBlock(1024, 512, use_dropout=True, use_upsampling=use_upsampling, mode=mode) # CD1024
        self.dec4 = UpsamplingBlock(1024, 512, use_upsampling=use_upsampling, mode=mode) # C1024
        self.dec5 = UpsamplingBlock(1024, 256, use_upsampling=use_upsampling, mode=mode) # C1024
        self.dec6 = UpsamplingBlock(512, 128, use_upsampling=use_upsampling, mode=mode) # C512
        self.dec7 = UpsamplingBlock(256, 64, use_upsampling=use_upsampling, mode=mode) # C256
        self.dec8 = UpsamplingBlock(128, c_out, use_upsampling=use_upsampling, mode=mode) # C128
    

    def forward(self, x):
        x9 = torch.cat([x[1], self.dec1(x[0])], 1) # (N,1024,H,W)
        x10 = torch.cat([x[2], self.dec2(x9)], 1) # (N,1024,H,W)
        x11 = torch.cat([x[3], self.dec3(x10)], 1) # (N,1024,H,W)
        x12 = torch.cat([x[4], self.dec4(x11)], 1) # (N,1024,H,W)
        x13 = torch.cat([x[5], self.dec5(x12)], 1) # (N,512,H,W)
        x14 = torch.cat([x[6], self.dec6(x13)], 1) # (N,256,H,W)
        x15 = torch.cat([x[7], self.dec7(x14)], 1) # (N,128,H,W)
        out = self.dec8(x15) # (N,64,H,W)
        return out
    

class UnetGenerator(nn.Module):
    """Create a Unet-based generator"""
    def __init__(self, c_in=3, c_out=3, use_upsampling=False, mode='nearest'):
        """
        Constructs a Unet generator
        Args:
            c_in (int): The number of input channels.
            c_out (int): The number of output channels.
            use_upsampling (bool, optional): Upsampling method for decoder. 
                If True, use upsampling layer followed regular convolution layer.
                If False, use transpose convolution. Default is False
            mode (str, optional): the upsampling algorithm: one of 'nearest', 
                'bilinear', 'bicubic'. Default: 'nearest'
        """
        super(UnetGenerator, self).__init__()
        self.encoder = UnetEncoder(c_in=c_in)
        self.decoder = UnetDecoder(use_upsampling=use_upsampling, mode=mode)
        # In the paper, the authors state:
        #   """
        #       After the last layer in the decoder, a convolution is applied
        #       to map to the number of output channels (3 in general, except
        #       in colorization, where it is 2), followed by a Tanh function.
        #   """
        # However, in the official Lua implementation, only a Tanh layer is applied.
        # Therefore, I took the liberty of adding a convolutional layer with a 
        # kernel size of 3.
        # For more information please check the paper and official github repo:
        # https://github.com/phillipi/pix2pix
        # https://arxiv.org/abs/1611.07004
        self.head = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=c_out,
                      kernel_size=3, stride=1, padding=1,
                      bias=True
                      ), 
            nn.Tanh()
            )
    
    def forward(self, x):
        outE = self.encoder(x)
        outD = self.decoder(outE)
        out = self.head(outD)
        return out


class PatchDiscriminator(nn.Module):
    """Create a PatchGAN discriminator"""
    def __init__(self, c_in=3, c_hid=64, n_layers=3):
        """Constructs a PatchGAN discriminator

        Args:
            c_in (int, optional): The number of input channels. Defaults to 3.
            c_hid (int, optional): The number of channels after first conv layer.
                Defaults to 64.
            n_layers (int, optional): the number of convolution blocks in the 
                discriminator. Defaults to 3.
        """
        super(PatchDiscriminator, self).__init__()
        model = [DownsamplingBlock(c_in, c_hid, use_norm=False)]

        n_p = 1  # multiplier for previous channel
        n_c = 1  # multiplier for current channel
        # last block is with stride of 1, therefore iterate (n_layers-1) times
        for n in range(1, n_layers): 
            n_p = n_c
            n_c = min(2**n, 8)  # The number of channels is 512 at most

            model += [DownsamplingBlock(c_hid*n_p, c_hid*n_c)]
        
        n_p = n_c
        n_c = min(2**n_layers, 8)
        model += [DownsamplingBlock(c_hid*n_p, c_hid*n_c, stride=1)] # last block is with stride of 1

        # last layer is a convolution followed by a Sigmoid function.
        model += [nn.Conv2d(in_channels=c_hid*n_c, out_channels=1, 
                            kernel_size=4, stride=1, padding=1, bias=True
                            )] 
        # Normally, there should be a sigmoid layer at the end of discriminator.
        # However, nn.BCEWithLogitsLoss combines the sigmoid layer with BCE loss, 
        # providing greater numerical stability. Therefore, the discriminator outputs
        # logits to take advantage of this stability.
        self.model = nn.Sequential(*model)

    def forward(self, x):
        return self.model(x)
    

class PixelDiscriminator(nn.Module):
    """Create a PixelGAN discriminator (1x1 PatchGAN discriminator)"""
    def __init__(self, c_in=3, c_hid=64):
        """Constructs a PixelGAN discriminator, a special form of PatchGAN Discriminator.
        All convolutions are 1x1 spatial filters

        Args:
            c_in (int, optional): The number of input channels. Defaults to 3.
            c_hid (int, optional): The number of channels after first conv layer.
                Defaults to 64.
        """
        super(PixelDiscriminator, self).__init__()
        self.model = nn.Sequential(
            DownsamplingBlock(c_in, c_hid, kernel_size=1, stride=1, padding=0, use_norm=False),
            DownsamplingBlock(c_hid, c_hid*2, kernel_size=1, stride=1, padding=0),
            nn.Conv2d(in_channels=c_hid*2, out_channels=1, kernel_size=1)
            )
        # Similar to PatchDiscriminator, there should be a sigmoid layer at the end of discriminator.
        # However, nn.BCEWithLogitsLoss combines the sigmoid layer with BCE loss, 
        # providing greater numerical stability. Therefore, the discriminator outputs
        # logits to take advantage of this stability.

    def forward(self, x):
        return self.model(x)
    

class PatchGAN(nn.Module):
    """Create a PatchGAN discriminator"""
    def __init__(self, c_in=3, c_hid=64, mode='patch', n_layers=3):
        """Constructs a PatchGAN discriminator.

        Args:
            c_in (int, optional): The number of input channels. Defaults to 3.
            c_hid (int, optional): The number of channels after first 
                convolutional layer. Defaults to 64.
            mode (str, optional): PatchGAN type. Use 'pixel' for PixelGAN, and 
                'patch' for other types. Defaults to 'patch'.
            n_layers (int, optional): PatchGAN number of layers. Defaults to 3.
                - 16x16 PatchGAN if n=1
                - 34x34 PatchGAN if n=2
                - 70x70 PatchGAN if n=3
                - 142x142 PatchGAN if n=4
                - 286x286 PatchGAN if n=5
                - 574x574 PatchGAN if n=6
        """
        super(PatchGAN, self).__init__()
        if mode == 'pixel':
            self.model = PixelDiscriminator(c_in, c_hid)
        else:
            self.model = PatchDiscriminator(c_in, c_hid, n_layers)
    
    def forward(self, x):
        return self.model(x)