| """VGG2L module definition for transformer encoder.""" | |
| from typing import Tuple | |
| from typing import Union | |
| import torch | |
class VGG2L(torch.nn.Module):
    """VGG-style convolutional front-end for a custom encoder.

    Two groups of paired 3x3 convolutions (64, then 128 channels) are each
    followed by max pooling: kernel (3, 2) after the first group and (2, 2)
    after the second. The pooled feature maps are flattened per frame and
    projected to ``odim`` by a linear layer, optionally followed by a
    positional encoding module.

    Args:
        idim: Dimension of inputs
        odim: Dimension of outputs
        pos_enc: Positional encoding module appended after the projection

    """

    def __init__(self, idim: int, odim: int, pos_enc: torch.nn.Module = None):
        """Construct a VGG2L object."""
        super().__init__()

        # Layers are created in forward order so that seeded initialisation
        # and the Sequential indices (state_dict keys) stay stable.
        conv_layers = [
            torch.nn.Conv2d(1, 64, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 64, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d((3, 2)),
            torch.nn.Conv2d(64, 128, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(128, 128, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d((2, 2)),
        ]
        self.vgg2l = torch.nn.Sequential(*conv_layers)

        # Both pooling layers halve (with flooring) the feature axis; the
        # last convolution emits 128 channels.
        flat_dim = 128 * ((idim // 2) // 2)
        projection = torch.nn.Linear(flat_dim, odim)
        if pos_enc is None:
            self.output = projection
        else:
            self.output = torch.nn.Sequential(projection, pos_enc)

    def forward(
        self, x: torch.Tensor, x_mask: torch.Tensor
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor],
    ]:
        """Run the convolutional front-end and subsample the mask.

        Args:
            x: Input tensor (B, T, idim)
            x_mask: Input mask (B, 1, T), or None to skip mask handling

        Returns:
            x: Output tensor (B, sub(T), odim)
                or ((B, sub(T), odim), (B, sub(T), att_dim)) when the
                output module returns a pair
            x_mask: Output mask (B, 1, sub(T)), or None if no mask was given

        """
        # Add a singleton channel axis: (B, T, idim) -> (B, 1, T, idim).
        feats = self.vgg2l(x.unsqueeze(1))
        n_batch, n_chan, n_frames, n_feat = feats.size()

        # (B, C, T', F') -> (B, T', C * F') so each frame is one flat vector.
        feats = feats.transpose(1, 2).contiguous()
        feats = feats.view(n_batch, n_frames, n_chan * n_feat)
        out = self.output(feats)

        if x_mask is None:
            return out, None
        return out, self.create_new_mask(x_mask)

    def create_new_mask(self, x_mask: torch.Tensor) -> torch.Tensor:
        """Subsample x_mask along time to match the pooled sequence length.

        Args:
            x_mask: Input mask (B, 1, T)

        Returns:
            x_mask: Output mask (B, 1, sub(T))

        """
        # One pass per pooling layer (time kernels 3 and 2): drop the
        # trailing frames that do not fill a whole window, then keep the
        # first frame of every window.
        for factor in (3, 2):
            total = x_mask.size(2)
            usable = total - total % factor
            x_mask = x_mask[:, :, :usable:factor]
        return x_mask