| import os |
| import sys |
|
|
| import torch |
|
|
| import torch.nn as nn |
|
|
| sys.path.append(os.getcwd()) |
|
|
| from infer.lib.predictors.RMVPE.yolo import YOLO13Encoder, YOLO13FullPADDecoder, HyperACE |
|
|
| class ConvBlockRes(nn.Module): |
| def __init__( |
| self, |
| in_channels, |
| out_channels, |
| momentum=0.01 |
| ): |
| super(ConvBlockRes, self).__init__() |
| self.conv = nn.Sequential( |
| nn.Conv2d( |
| in_channels=in_channels, |
| out_channels=out_channels, |
| kernel_size=(3, 3), |
| stride=(1, 1), |
| padding=(1, 1), |
| bias=False |
| ), |
| nn.BatchNorm2d( |
| out_channels, |
| momentum=momentum |
| ), |
| nn.ReLU(), |
| nn.Conv2d( |
| in_channels=out_channels, |
| out_channels=out_channels, |
| kernel_size=(3, 3), |
| stride=(1, 1), |
| padding=(1, 1), |
| bias=False |
| ), |
| nn.BatchNorm2d( |
| out_channels, |
| momentum=momentum |
| ), |
| nn.ReLU() |
| ) |
|
|
| if in_channels != out_channels: |
| self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) |
| self.is_shortcut = True |
| else: self.is_shortcut = False |
|
|
| def forward(self, x): |
| return ( |
| self.conv(x) + self.shortcut(x) |
| ) if self.is_shortcut else ( |
| self.conv(x) + x |
| ) |
|
|
| class ResEncoderBlock(nn.Module): |
| def __init__( |
| self, |
| in_channels, |
| out_channels, |
| kernel_size, |
| n_blocks=1, |
| momentum=0.01 |
| ): |
| super(ResEncoderBlock, self).__init__() |
| self.n_blocks = n_blocks |
| self.conv = nn.ModuleList() |
| self.conv.append( |
| ConvBlockRes( |
| in_channels, |
| out_channels, |
| momentum |
| ) |
| ) |
|
|
| for _ in range(n_blocks - 1): |
| self.conv.append( |
| ConvBlockRes( |
| out_channels, |
| out_channels, |
| momentum |
| ) |
| ) |
|
|
| self.kernel_size = kernel_size |
| if self.kernel_size is not None: self.pool = nn.AvgPool2d(kernel_size=kernel_size) |
|
|
| def forward(self, x): |
| for i in range(self.n_blocks): |
| x = self.conv[i](x) |
|
|
| if self.kernel_size is not None: return x, self.pool(x) |
| else: return x |
|
|
| class Encoder(nn.Module): |
| def __init__( |
| self, |
| in_channels, |
| in_size, |
| n_encoders, |
| kernel_size, |
| n_blocks, |
| out_channels=16, |
| momentum=0.01 |
| ): |
| super(Encoder, self).__init__() |
| self.n_encoders = n_encoders |
| self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) |
| self.layers = nn.ModuleList() |
|
|
| for _ in range(self.n_encoders): |
| self.layers.append( |
| ResEncoderBlock( |
| in_channels, |
| out_channels, |
| kernel_size, |
| n_blocks, |
| momentum=momentum |
| ) |
| ) |
|
|
| in_channels = out_channels |
| out_channels *= 2 |
| in_size //= 2 |
| |
| self.out_size = in_size |
| self.out_channel = out_channels |
|
|
| def forward(self, x): |
| concat_tensors = [] |
| x = self.bn(x) |
|
|
| for layer in self.layers: |
| t, x = layer(x) |
| concat_tensors.append(t) |
|
|
| return x, concat_tensors |
|
|
| class Intermediate(nn.Module): |
| def __init__( |
| self, |
| in_channels, |
| out_channels, |
| n_inters, |
| n_blocks, |
| momentum=0.01 |
| ): |
| super(Intermediate, self).__init__() |
| self.layers = nn.ModuleList() |
| self.layers.append( |
| ResEncoderBlock( |
| in_channels, |
| out_channels, |
| None, |
| n_blocks, |
| momentum |
| ) |
| ) |
|
|
| for _ in range(n_inters - 1): |
| self.layers.append( |
| ResEncoderBlock( |
| out_channels, |
| out_channels, |
| None, |
| n_blocks, |
| momentum |
| ) |
| ) |
|
|
| def forward(self, x): |
| for layer in self.layers: |
| x = layer(x) |
|
|
| return x |
|
|
| class ResDecoderBlock(nn.Module): |
| def __init__( |
| self, |
| in_channels, |
| out_channels, |
| stride, |
| n_blocks=1, |
| momentum=0.01 |
| ): |
| super(ResDecoderBlock, self).__init__() |
| out_padding = (0, 1) if stride == (1, 2) else (1, 1) |
| self.conv1 = nn.Sequential( |
| nn.ConvTranspose2d( |
| in_channels=in_channels, |
| out_channels=out_channels, |
| kernel_size=(3, 3), |
| stride=stride, |
| padding=(1, 1), |
| output_padding=out_padding, |
| bias=False |
| ), |
| nn.BatchNorm2d( |
| out_channels, |
| momentum=momentum |
| ), |
| nn.ReLU() |
| ) |
|
|
| self.conv2 = nn.ModuleList() |
| self.conv2.append( |
| ConvBlockRes( |
| out_channels * 2, |
| out_channels, |
| momentum |
| ) |
| ) |
|
|
| for _ in range(n_blocks - 1): |
| self.conv2.append( |
| ConvBlockRes( |
| out_channels, |
| out_channels, |
| momentum |
| ) |
| ) |
|
|
| def forward(self, x, concat_tensor): |
| x = torch.cat((self.conv1(x), concat_tensor), dim=1) |
| for conv2 in self.conv2: |
| x = conv2(x) |
|
|
| return x |
|
|
| class Decoder(nn.Module): |
| def __init__( |
| self, |
| in_channels, |
| n_decoders, |
| stride, |
| n_blocks, |
| momentum=0.01 |
| ): |
| super(Decoder, self).__init__() |
| self.layers = nn.ModuleList() |
|
|
| for _ in range(n_decoders): |
| out_channels = in_channels // 2 |
| self.layers.append( |
| ResDecoderBlock( |
| in_channels, |
| out_channels, |
| stride, |
| n_blocks, |
| momentum |
| ) |
| ) |
| in_channels = out_channels |
|
|
| def forward(self, x, concat_tensors): |
| for i, layer in enumerate(self.layers): |
| x = layer(x, concat_tensors[-1 - i]) |
|
|
| return x |
|
|
| class DeepUnet(nn.Module): |
| def __init__( |
| self, |
| kernel_size, |
| n_blocks, |
| en_de_layers=5, |
| inter_layers=4, |
| in_channels=1, |
| en_out_channels=16 |
| ): |
| super(DeepUnet, self).__init__() |
| self.encoder = Encoder( |
| in_channels, |
| 128, |
| en_de_layers, |
| kernel_size, |
| n_blocks, |
| en_out_channels |
| ) |
| self.intermediate = Intermediate( |
| self.encoder.out_channel // 2, |
| self.encoder.out_channel, |
| inter_layers, |
| n_blocks |
| ) |
| self.decoder = Decoder( |
| self.encoder.out_channel, |
| en_de_layers, |
| kernel_size, |
| n_blocks |
| ) |
|
|
| def forward(self, x): |
| x, concat_tensors = self.encoder(x) |
|
|
| return self.decoder( |
| self.intermediate(x), |
| concat_tensors |
| ) |
| |
| class HPADeepUnet(nn.Module): |
| def __init__( |
| self, |
| in_channels=1, |
| en_out_channels=16, |
| base_channels=64, |
| hyperace_k=2, |
| hyperace_l=1, |
| num_hyperedges=16, |
| num_heads=8 |
| ): |
| super().__init__() |
| self.encoder = YOLO13Encoder( |
| in_channels, |
| base_channels |
| ) |
|
|
| enc_ch = self.encoder.out_channels |
|
|
| self.hyperace = HyperACE( |
| in_channels=enc_ch, |
| out_channels=enc_ch[-1], |
| num_hyperedges=num_hyperedges, |
| num_heads=num_heads, |
| k=hyperace_k, |
| l=hyperace_l |
| ) |
|
|
| self.decoder = YOLO13FullPADDecoder( |
| encoder_channels=enc_ch, |
| hyperace_out_c=enc_ch[-1], |
| out_channels_final=en_out_channels |
| ) |
|
|
| def forward(self, x): |
| features = self.encoder(x) |
|
|
| return nn.functional.interpolate( |
| self.decoder( |
| features, |
| self.hyperace(features) |
| ), |
| size=x.shape[2:], |
| mode='bilinear', |
| align_corners=False |
| ) |