Superxixixi committed on
Commit b98cec2
1 Parent(s): a25806a

Upload 5 files

Files changed (5)
  1. attentionLayer.py +39 -0
  2. audioEncoder.py +108 -0
  3. convLayer.py +42 -0
  4. loconet_encoder.py +90 -0
  5. visualEncoder.py +199 -0
attentionLayer.py ADDED
@@ -0,0 +1,39 @@

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import MultiheadAttention


class attentionLayer(nn.Module):

    def __init__(self, d_model, nhead, dropout=0.1):
        super(attentionLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Position-wise feed-forward network.
        self.linear1 = nn.Linear(d_model, d_model * 4)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_model * 4, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = F.relu

    def forward(self, src, tar, adjust=False, attn_mask=None):
        # type: (Tensor, Tensor, bool, Optional[Tensor]) -> Tensor
        # src and tar must share sequence length and feature dimension.
        src = src.transpose(0, 1)  # B, T, C -> T, B, C
        tar = tar.transpose(0, 1)  # B, T, C -> T, B, C
        # `adjust` swaps which stream provides the query vs. key/value.
        if adjust:
            src2 = self.self_attn(src, tar, tar, attn_mask=attn_mask, key_padding_mask=None)[0]
        else:
            src2 = self.self_attn(tar, src, src, attn_mask=attn_mask, key_padding_mask=None)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        src = src.transpose(0, 1)  # T, B, C -> B, T, C
        return src
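
For quick reference, a minimal smoke test (not part of the upload; shapes are illustrative, and both inputs must share T and C):

import torch
from attentionLayer import attentionLayer

layer = attentionLayer(d_model=128, nhead=8)
audio = torch.randn(2, 25, 128)    # B, T, C
video = torch.randn(2, 25, 128)
out = layer(src=audio, tar=video)  # cross-attention over the two streams
print(out.shape)                   # torch.Size([2, 25, 128])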
audioEncoder.py ADDED
@@ -0,0 +1,108 @@

import torch
import torch.nn as nn
import torch.nn.functional as F


class SEBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out


class SELayer(nn.Module):
    """Squeeze-and-Excitation: channel-wise re-weighting via a bottleneck MLP."""

    def __init__(self, channel, reduction=8):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)   # squeeze: global average pool
        y = self.fc(y).view(b, c, 1, 1)   # excite: per-channel gate in (0, 1)
        return x * y


class audioEncoder(nn.Module):
    """SE-ResNet encoder over a 2-D audio representation (e.g. MFCCs)."""

    def __init__(self, layers, num_filters, **kwargs):
        super(audioEncoder, self).__init__()
        block = SEBasicBlock
        self.inplanes = num_filters[0]

        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(num_filters[0])
        self.relu = nn.ReLU(inplace=True)

        self.layer1 = self._make_layer(block, num_filters[0], layers[0])
        self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1))
        out_dim = num_filters[3] * block.expansion

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Project the residual when the spatial size or channel count changes.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = torch.mean(x, dim=2, keepdim=True)      # average out the frequency axis
        x = x.view((x.size()[0], x.size()[1], -1))  # B, C, T
        x = x.transpose(1, 2)                       # B, T, C

        return x
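
A minimal usage sketch (not part of the upload); the depths, filter counts, and MFCC layout below are assumptions for illustration, not values pinned by this commit:

import torch
from audioEncoder import audioEncoder

enc = audioEncoder(layers=[3, 4, 6, 3], num_filters=[16, 32, 64, 128])
mfcc = torch.randn(2, 1, 13, 400)  # B, 1, n_mfcc, time (assumed layout)
feats = enc(mfcc)
print(feats.shape)                 # torch.Size([2, 100, 128]) -> B, T, C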
convLayer.py ADDED
@@ -0,0 +1,42 @@

import torch
import torch.nn as nn
from torch.nn import functional as F


class ConvLayer(nn.Module):

    def __init__(self, cfg):
        super(ConvLayer, self).__init__()
        self.cfg = cfg
        self.s = cfg.num_speakers
        # Mixes information across the speaker axis with an (s x 7) kernel.
        self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (self.s, 7), padding=(0, 3))
        # The line below is the speaker-parallel variant (93.88):
        # self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (3, 7), padding=(0, 3))
        self.ln = torch.nn.LayerNorm(256)
        self.conv2d_1x1 = torch.nn.Conv2d(256, 512, (1, 1), padding=(0, 0))
        self.conv2d_1x1_2 = torch.nn.Conv2d(512, 256, (1, 1), padding=(0, 0))
        self.gelu = nn.GELU()

    def forward(self, x, b, s):
        identity = x                   # b*s, t, c
        t = x.shape[1]
        c = x.shape[2]
        out = x.view(b, s, t, c)
        out = out.permute(0, 3, 1, 2)  # b, c, s, t

        out = self.conv2d(out)         # b, s*c, 1, t
        out = out.view(b, c, s, t)
        out = out.permute(0, 2, 3, 1)  # b, s, t, c
        out = self.ln(out)
        out = out.permute(0, 3, 1, 2)  # b, c, s, t
        out = self.conv2d_1x1(out)
        out = self.gelu(out)
        out = self.conv2d_1x1_2(out)   # b, c, s, t

        out = out.permute(0, 2, 3, 1)     # b, s, t, c
        out = out.reshape(b * s, t, c)    # reshape (not view): the permuted tensor is non-contiguous

        out += identity                   # residual connection

        return out, b, s
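
A minimal usage sketch (not part of the upload); SimpleNamespace stands in for the real config, which only needs num_speakers here:

from types import SimpleNamespace
import torch
from convLayer import ConvLayer

cfg = SimpleNamespace(num_speakers=3)
layer = ConvLayer(cfg)
b, s, t = 2, 3, 50              # s must equal cfg.num_speakers
x = torch.randn(b * s, t, 256)  # stacked per-speaker features
out, b, s = layer(x, b, s)
print(out.shape)                # torch.Size([6, 50, 256])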
loconet_encoder.py ADDED
@@ -0,0 +1,90 @@

import torch
import torch.nn as nn

from attentionLayer import attentionLayer
from convLayer import ConvLayer
from torchvggish import vggish
from visualEncoder import visualFrontend, visualConv1D, visualTCN


class locoencoder(nn.Module):

    def __init__(self, cfg):
        super(locoencoder, self).__init__()
        self.cfg = cfg
        # Visual temporal encoder
        self.visualFrontend = visualFrontend(cfg)  # visual frontend (3D conv + ResNet-18)
        self.visualTCN = visualTCN()               # visual temporal network (TCN)
        self.visualConv1D = visualConv1D()         # visual temporal network (Conv1D)

        # Audio encoder: pretrained VGGish, without its pre/post-processing
        urls = {
            'vggish':
                "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth"
        }
        self.audioEncoder = vggish.VGGish(urls, preprocess=False, postprocess=False)
        self.audio_pool = nn.AdaptiveAvgPool1d(1)

        # Audio-visual cross-attention
        self.crossA2V = attentionLayer(d_model=128, nhead=8)
        self.crossV2A = attentionLayer(d_model=128, nhead=8)

        # Audio-visual self-attention: alternating speaker-conv and attention blocks
        num_layers = self.cfg.av_layers
        layers = nn.ModuleList()
        for _ in range(num_layers):
            layers.append(ConvLayer(cfg))
            layers.append(attentionLayer(d_model=256, nhead=8))
        self.convAV = layers

    def forward_visual_frontend(self, x):
        B, T, W, H = x.shape
        x = x.view(B * T, 1, 1, W, H)
        x = (x / 255 - 0.4161) / 0.1688  # normalize grayscale frames
        x = self.visualFrontend(x)
        x = x.view(B, T, 512)
        x = x.transpose(1, 2)
        x = self.visualTCN(x)
        x = self.visualConv1D(x)
        x = x.transpose(1, 2)
        return x

    def forward_audio_frontend(self, x):
        t = x.shape[-2]
        numFrames = t // 4  # 4 audio feature frames per video frame
        pad = 8 - (t % 8)
        x = torch.nn.functional.pad(x, (0, 0, 0, pad), "constant")
        # x = x.unsqueeze(1).transpose(2, 3)
        x = self.audioEncoder(x)

        b, c, t2, freq = x.shape
        x = x.view(b * c, t2, freq)
        x = self.audio_pool(x)                 # collapse the frequency axis
        x = x.view(b, c, t2)[:, :, :numFrames]
        x = x.permute(0, 2, 1)                 # B, T, C
        return x

    def forward_cross_attention(self, x1, x2):
        x1_c = self.crossA2V(src=x1, tar=x2, adjust=self.cfg.adjust_attention)
        x2_c = self.crossV2A(src=x2, tar=x1, adjust=self.cfg.adjust_attention)
        return x1_c, x2_c

    def forward_audio_visual_backend(self, x1, x2, b=1, s=1):
        x = torch.cat((x1, x2), 2)  # B*S, T, 2C
        # Even entries are ConvLayers (speaker axis); odd entries are attention layers.
        for i, layer in enumerate(self.convAV):
            if i % 2 == 0:
                x, b, s = layer(x, b, s)
            else:
                x = layer(src=x, tar=x)

        x = torch.reshape(x, (-1, 256))
        return x

    def forward_audio_backend(self, x):
        x = torch.reshape(x, (-1, 128))
        return x

    def forward_visual_backend(self, x):
        x = torch.reshape(x, (-1, 128))
        return x
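
A hypothetical usage sketch (not part of the upload). The config fields are inferred from the attributes read above; instantiation fetches the VGGish weights, and the audio frontend depends on the bundled torchvggish returning 4-D feature maps, so only the visual and fusion paths are exercised here:

from types import SimpleNamespace
import torch
from loconet_encoder import locoencoder

cfg = SimpleNamespace(num_speakers=2, av_layers=2, adjust_attention=False)
model = locoencoder(cfg)                   # downloads VGGish weights on first run

frames = torch.randn(2, 25, 112, 112)      # (B*S, T, H, W): one clip, two speakers
v = model.forward_visual_frontend(frames)  # (2, 25, 128)
a = torch.randn(2, 25, 128)                # stand-in for VGGish audio features
a_c, v_c = model.forward_cross_attention(a, v)
av = model.forward_audio_visual_backend(a_c, v_c, b=1, s=2)
print(av.shape)                            # torch.Size([50, 256])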
visualEncoder.py ADDED
@@ -0,0 +1,199 @@

##
# ResNet-18 pretrained network to extract lip embeddings.
# This code is modified from https://github.com/lordmartian/deep_avsr
##

import torch
import torch.nn as nn
import torch.nn.functional as F
from attentionLayer import attentionLayer


class ResNetLayer(nn.Module):
    """
    A ResNet layer used to build the ResNet network.
    Architecture:
    --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu -->
         |                      ^                    |                  ^
         ----> downsample ------'                    -------------------'
    """

    def __init__(self, inplanes, outplanes, stride):
        super(ResNetLayer, self).__init__()
        self.conv1a = nn.Conv2d(inplanes,
                                outplanes,
                                kernel_size=3,
                                stride=stride,
                                padding=1,
                                bias=False)
        self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        self.conv2a = nn.Conv2d(outplanes,
                                outplanes,
                                kernel_size=3,
                                stride=1,
                                padding=1,
                                bias=False)
        self.stride = stride
        if self.stride != 1:
            # 1x1 projection for the strided residual branch.
            self.downsample = nn.Conv2d(inplanes,
                                        outplanes,
                                        kernel_size=(1, 1),
                                        stride=stride,
                                        bias=False)
        self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)

        self.conv1b = nn.Conv2d(outplanes,
                                outplanes,
                                kernel_size=3,
                                stride=1,
                                padding=1,
                                bias=False)
        self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        self.conv2b = nn.Conv2d(outplanes,
                                outplanes,
                                kernel_size=3,
                                stride=1,
                                padding=1,
                                bias=False)
        self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)

    def forward(self, inputBatch):
        batch = F.relu(self.bn1a(self.conv1a(inputBatch)))
        batch = self.conv2a(batch)
        if self.stride == 1:
            residualBatch = inputBatch
        else:
            residualBatch = self.downsample(inputBatch)
        batch = batch + residualBatch
        intermediateBatch = batch
        batch = F.relu(self.outbna(batch))

        batch = F.relu(self.bn1b(self.conv1b(batch)))
        batch = self.conv2b(batch)
        residualBatch = intermediateBatch
        batch = batch + residualBatch
        outputBatch = F.relu(self.outbnb(batch))
        return outputBatch


class ResNet(nn.Module):
    """
    An 18-layer ResNet architecture.
    """

    def __init__(self):
        super(ResNet, self).__init__()
        self.layer1 = ResNetLayer(64, 64, stride=1)
        self.layer2 = ResNetLayer(64, 128, stride=2)
        self.layer3 = ResNetLayer(128, 256, stride=2)
        self.layer4 = ResNetLayer(256, 512, stride=2)
        self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1))

    def forward(self, inputBatch):
        batch = self.layer1(inputBatch)
        batch = self.layer2(batch)
        batch = self.layer3(batch)
        batch = self.layer4(batch)
        outputBatch = self.avgpool(batch)
        return outputBatch


class GlobalLayerNorm(nn.Module):
    """Layer normalization over both the channel and time axes."""

    def __init__(self, channel_size):
        super(GlobalLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))   # [1, N, 1]
        self.reset_parameters()

    def reset_parameters(self):
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  # [M, 1, 1]
        var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
        gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta
        return gLN_y


class visualFrontend(nn.Module):
    """
    A visual feature extraction module. Generates a 512-dim feature vector per video frame.
    Architecture: a 3D convolution block followed by an 18-layer ResNet.
    """

    def __init__(self, cfg):
        super(visualFrontend, self).__init__()
        self.cfg = cfg
        self.frontend3D = nn.Sequential(
            nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3),
                      bias=False), nn.BatchNorm3d(64, momentum=0.01, eps=0.001), nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
        self.resnet = ResNet()

    def forward(self, inputBatch):
        # (N, 1, 1, W, H) -> (1, 1, N, W, H): treat the frame stack as one 3-D volume.
        inputBatch = inputBatch.transpose(0, 1).transpose(1, 2)
        batchsize = inputBatch.shape[0]
        batch = self.frontend3D(inputBatch)

        batch = batch.transpose(1, 2)
        batch = batch.reshape(batch.shape[0] * batch.shape[1], batch.shape[2], batch.shape[3],
                              batch.shape[4])
        outputBatch = self.resnet(batch)
        outputBatch = outputBatch.reshape(batchsize, -1, 512)
        outputBatch = outputBatch.transpose(1, 2)
        outputBatch = outputBatch.transpose(1, 2).transpose(0, 1)  # (N, 1, 512): one vector per frame
        return outputBatch


class DSConv1d(nn.Module):
    """Depthwise-separable 1-D convolution block with a residual connection."""

    def __init__(self):
        super(DSConv1d, self).__init__()
        self.net = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False),
            nn.PReLU(),
            GlobalLayerNorm(512),
            nn.Conv1d(512, 512, 1, bias=False),
        )

    def forward(self, x):
        out = self.net(x)
        return out + x


class visualTCN(nn.Module):

    def __init__(self):
        super(visualTCN, self).__init__()
        stacks = []
        for _ in range(5):
            stacks += [DSConv1d()]
        self.net = nn.Sequential(*stacks)  # visual temporal network (V-TCN)

    def forward(self, x):
        out = self.net(x)
        return out


class visualConv1D(nn.Module):

    def __init__(self):
        super(visualConv1D, self).__init__()
        self.net = nn.Sequential(
            nn.Conv1d(512, 256, 5, stride=1, padding=2),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Conv1d(256, 128, 1),
        )

    def forward(self, x):
        out = self.net(x)
        return out
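
A shape walkthrough for the visual stack (not part of the upload; the 5-D input layout matches how loconet_encoder.py calls visualFrontend):

from types import SimpleNamespace
import torch
from visualEncoder import visualFrontend, visualTCN, visualConv1D

frontend = visualFrontend(cfg=SimpleNamespace())  # cfg is stored but not read here
clip = torch.randn(25, 1, 1, 112, 112)            # 25 grayscale 112x112 frames
emb = frontend(clip)                              # (25, 1, 512): one vector per frame

seq = emb.reshape(1, 25, 512).transpose(1, 2)     # (B, 512, T) for the 1-D convs
out = visualConv1D()(visualTCN()(seq))
print(out.shape)                                  # torch.Size([1, 128, 25])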