Spaces:

mattricesound
/

RemFx

Runtime error

App Files Files Community

mattricesound committed on Mar 13, 2023

Commit

7d6f241

•

1 Parent(s): e4fc05d

Init dcunet and dptnet

Browse files

Files changed (13) hide show

cfg/model/audio_diffusion.yaml +2 -2
cfg/model/classifier.yaml +1 -1
cfg/model/dcunet.yaml +22 -0
cfg/model/demucs.yaml +1 -1
cfg/model/dptnet.yaml +18 -0
cfg/model/umx.yaml +2 -2
remfx/cnn14.py +138 -0
remfx/datasets.py +1 -0
remfx/dcunet.py +649 -0
remfx/dptnet.py +460 -0
remfx/models.py +39 -199
remfx/utils.py +92 -0
scripts/test.py +0 -1

cfg/model/audio_diffusion.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
-model:
-  _target_: remfx.models.RemFXModel
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999

 # @package _global_
+model:
+  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999

cfg/model/classifier.yaml CHANGED Viewed

@@ -5,7 +5,7 @@ model:
   lr_weight_decay: 1e-3
   sample_rate: ${sample_rate}
   network:
-    _target_: remfx.models.Cnn14
     num_classes: ${num_classes}
     n_fft: 4096
     hop_length: 512

   lr_weight_decay: 1e-3
   sample_rate: ${sample_rate}
   network:
+    _target_: remfx.cnn14.Cnn14
     num_classes: ${num_classes}
     n_fft: 4096
     hop_length: 512

cfg/model/dcunet.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+# @package _global_
+# Hydra config for the DCUNet model.
+model:
+  _target_: remfx.models.RemFx
+  lr: 1e-4
+  lr_beta1: 0.95
+  lr_beta2: 0.999
+  lr_eps: 1e-6
+  lr_weight_decay: 1e-3
+  sample_rate: ${sample_rate}
+  network:
+    _target_: remfx.models.DCUNetModel
+    # Must be a plain integer: YAML does not evaluate arithmetic, so the former
+    # `256 + 1` was loaded as the string "256 + 1".
+    # 257 == filter_len // 2 + 1, the cut-off used by STFT in remfx/dcunet.py.
+    spec_dim: 257
+    hidden_dim: 768
+    filter_len: 512
+    hop_len: 64
+    block_layers: 4
+    layers: 4
+    kernel_size: 3
+    refine_layers: 1
+    is_mask: True
+    norm: 'ins'
+    act: 'comp'

cfg/model/demucs.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFXModel
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999

 # @package _global_
 model:
+  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999

cfg/model/dptnet.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+# @package _global_
+# Hydra config for the DPTNet model.
+model:
+  _target_: remfx.models.RemFx
+  lr: 1e-4
+  lr_beta1: 0.95
+  lr_beta2: 0.999
+  lr_eps: 1e-6
+  lr_weight_decay: 1e-3
+  sample_rate: ${sample_rate}
+  network:
+    _target_: remfx.models.DPTNetModel
+    # NOTE(review): the keys below mirror DPTNet_base.__init__ in remfx/dptnet.py;
+    # presumably DPTNetModel forwards them unchanged -- confirm.
+    enc_dim: 256
+    feature_dim: 64
+    hidden_dim: 128
+    layer: 6
+    segment_size: 250
+    # DPTNet_base itself defaults to nspk=2; a single output source is requested here.
+    nspk: 1
+    win_len: 2

cfg/model/umx.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
-model:
-  _target_: remfx.models.RemFXModel
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999

 # @package _global_
+model:
+  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999

remfx/cnn14.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import torch
+import torchaudio
+import torch.nn as nn
+import torch.nn.functional as F
+from utils import init_bn, init_layer
+# adapted from https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
+class Cnn14(nn.Module):
+    """CNN14-style classifier that runs on a mel spectrogram of the input audio.
+    Six ConvBlock stages (1 -> 64 -> ... -> 2048 channels) are followed by pooling
+    over the two trailing axes and two linear layers producing `num_classes` scores.
+    """
+    def __init__(
+        self,
+        num_classes: int,
+        sample_rate: float,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        n_mels: int = 128,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        # Registered as a buffer but not referenced anywhere else in this class.
+        window = torch.hann_window(n_fft)
+        self.register_buffer("window", window)
+        self.melspec = torchaudio.transforms.MelSpectrogram(
+            sample_rate,
+            n_fft,
+            hop_length=hop_length,
+            n_mels=n_mels,
+        )
+        # Normalises per mel bin: forward() moves the mel axis to dim 1 before bn0.
+        self.bn0 = nn.BatchNorm2d(n_mels)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+        self.fc1 = nn.Linear(2048, 2048, bias=True)
+        self.fc_audioset = nn.Linear(2048, num_classes, bias=True)
+        self.init_weight()
+    def init_weight(self):
+        """Initialise bn0 and the two linear layers with the helpers from utils."""
+        init_bn(self.bn0)
+        init_layer(self.fc1)
+        init_layer(self.fc_audioset)
+    def forward(self, x: torch.Tensor):
+        """
+        Return un-normalised class scores of shape (batch_size, num_classes).
+        NOTE(review): the 4-axis permutes below require the mel spectrogram to be
+        4-D, and conv_block1 expects a single channel, so the input is presumably
+        (batch_size, 1, data_length) rather than (batch_size, data_length) -- confirm."""
+        x = self.melspec(x)
+        # Swap dims 1 and 2 so that bn0 (BatchNorm2d(n_mels)) sees mel bins as channels,
+        # then swap back.
+        x = x.permute(0, 2, 1, 3)
+        x = self.bn0(x)
+        x = x.permute(0, 2, 1, 3)
+        if self.training:
+            pass
+            # Spectrogram augmentation is disabled:
+            # x = self.spec_augmenter(x)
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block5(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        # Last stage keeps its resolution (1x1 pooling).
+        x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        # Collapse the last axis by mean, then combine max and mean over the remaining one.
+        x = torch.mean(x, dim=3)
+        (x1, _) = torch.max(x, dim=2)
+        x2 = torch.mean(x, dim=2)
+        x = x1 + x2
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu_(self.fc1(x))
+        clipwise_output = self.fc_audioset(x)
+        return clipwise_output
+class ConvBlock(nn.Module):
+    """Two 3x3 conv + batch-norm + ReLU layers followed by a selectable 2-D pooling."""
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+        # Both convolutions keep the spatial size (stride 1, padding 1) and have no
+        # bias; each is followed by its own batch norm.
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+        self.conv2 = nn.Conv2d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+        self.init_weight()
+    def init_weight(self):
+        """Initialise both convolutions and both batch norms with the utils helpers."""
+        init_layer(self.conv1)
+        init_layer(self.conv2)
+        init_bn(self.bn1)
+        init_bn(self.bn2)
+    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
+        """
+        Apply conv-bn-relu twice, then pool.
+        :param input: tensor accepted by nn.Conv2d with `in_channels` channels
+        :param pool_size: kernel size handed to the pooling function
+        :param pool_type: "max", "avg" or "avg+max" (sum of both poolings)
+        :raises Exception: for any other `pool_type`
+        """
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        x = F.relu_(self.bn2(self.conv2(x)))
+        if pool_type == "max":
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg":
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg+max":
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception("Incorrect argument!")
+        return x

remfx/datasets.py CHANGED Viewed

@@ -250,6 +250,7 @@ class VocalSet(Dataset):
         # Normalize
         normalized_dry = self.normalize(dry)
         normalized_wet = self.normalize(wet)
         return normalized_dry, normalized_wet, dry_labels_tensor, wet_labels_tensor

         # Normalize
         normalized_dry = self.normalize(dry)
         normalized_wet = self.normalize(wet)
         return normalized_dry, normalized_wet, dry_labels_tensor, wet_labels_tensor

remfx/dcunet.py ADDED Viewed

	@@ -0,0 +1,649 @@

+# Adapted from https://github.com/AppleHolic/source_separation/tree/master/source_separation
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from utils import single, concat_complex
+from torch.nn.init import calculate_gain
+from typing import Tuple
+from scipy.signal import get_window
+from librosa.util import pad_center
+class ComplexConvBlock(nn.Module):
+    """
+    Convolution block: `layers` repetitions of norm -> activation -> ComplexConv1d.
+    The input is added back to the output as a residual when both have the same
+    size and `skip_res` is False.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        padding: int = 0,
+        layers: int = 4,
+        bn_func=nn.BatchNorm1d,
+        act_func=nn.LeakyReLU,
+        skip_res: bool = False,
+    ):
+        super().__init__()
+        # modules
+        self.blocks = nn.ModuleList()
+        self.skip_res = skip_res
+        for idx in range(layers):
+            # Only the first layer changes the channel count.
+            in_ = in_channels if idx == 0 else out_channels
+            self.blocks.append(
+                nn.Sequential(
+                    *[
+                        bn_func(in_),
+                        act_func(),
+                        ComplexConv1d(in_, out_channels, kernel_size, padding=padding),
+                    ]
+                )
+            )
+    def forward(self, x: torch.tensor) -> torch.tensor:
+        """Run all layers; add the residual unless shapes differ or skip_res is set."""
+        temp = x
+        for idx, block in enumerate(self.blocks):
+            x = block(x)
+        if temp.size() != x.size() or self.skip_res:
+            return x
+        else:
+            return x + temp
+class SpectrogramUnet(nn.Module):
+    """
+    U-Net over a log-magnitude / phase STFT representation of a waveform.
+    The waveform is transformed with STFT, processed by a down / up stack of
+    ComplexConvBlock modules with skip connections, refined together with the
+    original spectrogram and converted back to a waveform.
+    :param spec_dim: number of STFT bins per half (magnitude or phase)
+    :param hidden_dim: channel count inside the U-Net
+    :param filter_len: STFT filter length
+    :param hop_len: STFT hop length
+    :param layers: number of down blocks and of up blocks
+    :param block_layers: conv layers inside each down / up block
+    :param kernel_size: conv kernel size (padding is kernel_size // 2)
+    :param is_mask: when True the network output masks the input spectrogram
+    :param norm: "bn" (BatchNorm1d) or "ins" (affine InstanceNorm1d)
+    :param act: "tanh" or "comp" (ComplexActLayer)
+    :raises NotImplementedError: for an unknown `norm` or `act`
+    """
+    def __init__(
+        self,
+        spec_dim: int,
+        hidden_dim: int,
+        filter_len: int,
+        hop_len: int,
+        layers: int = 3,
+        block_layers: int = 3,
+        kernel_size: int = 5,
+        is_mask: bool = False,
+        norm: str = "bn",
+        act: str = "tanh",
+    ):
+        super().__init__()
+        self.layers = layers
+        self.is_mask = is_mask
+        # stft modules
+        self.stft = STFT(filter_len, hop_len)
+        if norm == "bn":
+            self.bn_func = nn.BatchNorm1d
+        elif norm == "ins":
+            self.bn_func = lambda x: nn.InstanceNorm1d(x, affine=True)
+        else:
+            raise NotImplementedError("{} is not implemented !".format(norm))
+        # act_out is only stored here; nothing in this class calls it.
+        if act == "tanh":
+            self.act_func = nn.Tanh
+            self.act_out = nn.Tanh
+        elif act == "comp":
+            self.act_func = ComplexActLayer
+            self.act_out = lambda: ComplexActLayer(is_out=True)
+        else:
+            raise NotImplementedError("{} is not implemented !".format(act))
+        # prev conv: 1x1 projection of the stacked (mag, phase) input to hidden_dim
+        self.prev_conv = ComplexConv1d(spec_dim * 2, hidden_dim, 1)
+        # down
+        self.down = nn.ModuleList()
+        self.down_pool = nn.MaxPool1d(3, stride=2, padding=1)
+        for idx in range(self.layers):
+            block = ComplexConvBlock(
+                hidden_dim,
+                hidden_dim,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                bn_func=self.bn_func,
+                act_func=self.act_func,
+                layers=block_layers,
+            )
+            self.down.append(block)
+        # up
+        self.up = nn.ModuleList()
+        for idx in range(self.layers):
+            # After the first up block the input is the previous output concatenated
+            # with a skip connection, hence twice the channels.
+            in_c = hidden_dim if idx == 0 else hidden_dim * 2
+            self.up.append(
+                nn.Sequential(
+                    ComplexConvBlock(
+                        in_c,
+                        hidden_dim,
+                        kernel_size=kernel_size,
+                        padding=kernel_size // 2,
+                        bn_func=self.bn_func,
+                        act_func=self.act_func,
+                        layers=block_layers,
+                    ),
+                    self.bn_func(hidden_dim),
+                    self.act_func(),
+                    ComplexTransposedConv1d(
+                        hidden_dim, hidden_dim, kernel_size=2, stride=2
+                    ),
+                )
+            )
+        # out_conv
+        self.out_conv = nn.Sequential(
+            ComplexConvBlock(
+                hidden_dim * 2,
+                spec_dim * 2,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                bn_func=self.bn_func,
+                act_func=self.act_func,
+            ),
+            self.bn_func(spec_dim * 2),
+            self.act_func(),
+        )
+        # refine conv: consumes the network output concatenated with the input spectrogram
+        self.refine_conv = nn.Sequential(
+            ComplexConvBlock(
+                spec_dim * 4,
+                spec_dim * 2,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                bn_func=self.bn_func,
+                act_func=self.act_func,
+            ),
+            self.bn_func(spec_dim * 2),
+            self.act_func(),
+        )
+    def log_stft(self, wav):
+        """Return (log(magnitude + 1), phase) of the STFT of `wav`."""
+        # stft
+        mag, phase = self.stft.transform(wav)
+        return torch.log(mag + 1), phase
+    def exp_istft(self, log_mag, phase):
+        """Invert log_stft: undo the log(1 + x) compression, then run the inverse STFT."""
+        # exp
+        mag = np.e**log_mag - 1
+        # istft
+        wav = self.stft.inverse(mag, phase)
+        return wav
+    def adjust_diff(self, x, target):
+        """
+        Reflect-pad `x` along its last axis so that it is as long as `target`.
+        :param x: tensor whose last axis is not longer than that of `target`
+        :param target: tensor providing the wanted length
+        :return: `x` itself when the lengths already match, else the padded tensor
+        """
+        size_diff = target.size()[-1] - x.size()[-1]
+        assert size_diff >= 0
+        if size_diff > 0:
+            # Put the extra sample of an odd difference on the right; padding
+            # size_diff // 2 on both sides would leave the result one sample short.
+            left = size_diff // 2
+            right = size_diff - left
+            x = F.pad(x.unsqueeze(1), (left, right), "reflect").squeeze(1)
+        return x
+    def masking(self, mag, phase, origin_mag, origin_phase):
+        """
+        Use the network output as a mask on the original spectrogram.
+        The magnitude mask is tanh(|mag|); the phase mask is mag / |mag|.
+        NOTE(review): mag / |mag| is NaN wherever `mag` is exactly zero.
+        """
+        abs_mag = torch.abs(mag)
+        mag_mask = torch.tanh(abs_mag)
+        phase_mask = mag / abs_mag
+        # masking
+        mag = mag_mask * origin_mag
+        phase = phase_mask * (origin_phase + phase)
+        return mag, phase
+    def forward(self, wav):
+        """Map a waveform to a processed waveform of the same length."""
+        # stft
+        origin_mag, origin_phase = self.log_stft(wav)
+        origin_x = torch.cat([origin_mag, origin_phase], dim=1)
+        # prev
+        x = self.prev_conv(origin_x)
+        # body
+        # down
+        down_cache = []
+        for idx, block in enumerate(self.down):
+            x = block(x)
+            down_cache.append(x)
+            x = self.down_pool(x)
+        # up
+        for idx, block in enumerate(self.up):
+            x = block(x)
+            # Skip connections are taken deepest first and resized to the current length.
+            res = F.interpolate(
+                down_cache[self.layers - (idx + 1)],
+                size=[x.size()[2]],
+                mode="linear",
+                align_corners=False,
+            )
+            x = concat_complex(x, res, dim=1)
+        # match spec dimension
+        x = self.out_conv(x)
+        if origin_mag.size(2) != x.size(2):
+            x = F.interpolate(
+                x, size=[origin_mag.size(2)], mode="linear", align_corners=False
+            )
+        # refine
+        x = self.refine_conv(concat_complex(x, origin_x))
+        def to_wav(stft):
+            mag, phase = stft.chunk(2, 1)
+            if self.is_mask:
+                mag, phase = self.masking(mag, phase, origin_mag, origin_phase)
+            out = self.exp_istft(mag, phase)
+            out = self.adjust_diff(out, wav)
+            return out
+        refine_wav = to_wav(x)
+        return refine_wav
+class RefineSpectrogramUnet(SpectrogramUnet):
+    """
+    SpectrogramUnet variant whose refine stage is a list of masking steps.
+    Each refine step maps the current spectrogram to a new one and always applies
+    masking against the original spectrogram (regardless of `is_mask`).
+    With `add_spec_results` the forward pass also returns the final mag and phase.
+    """
+    def __init__(
+        self,
+        spec_dim: int,
+        hidden_dim: int,
+        filter_len: int,
+        hop_len: int,
+        layers: int = 4,
+        block_layers: int = 4,
+        kernel_size: int = 3,
+        is_mask: bool = True,
+        norm: str = "ins",
+        act: str = "comp",
+        refine_layers: int = 1,
+        add_spec_results: bool = False,
+    ):
+        super().__init__(
+            spec_dim,
+            hidden_dim,
+            filter_len,
+            hop_len,
+            layers,
+            block_layers,
+            kernel_size,
+            is_mask,
+            norm,
+            act,
+        )
+        self.add_spec_results = add_spec_results
+        # refine conv (replaces the parent's refine_conv)
+        # NOTE(review): `[module] * refine_layers` repeats the SAME Sequential
+        # instance, so for refine_layers > 1 every refine step shares one set of
+        # parameters -- confirm this is intended.
+        self.refine_conv = nn.ModuleList(
+            [
+                nn.Sequential(
+                    ComplexConvBlock(
+                        spec_dim * 2,
+                        spec_dim * 2,
+                        kernel_size=kernel_size,
+                        padding=kernel_size // 2,
+                        bn_func=self.bn_func,
+                        act_func=self.act_func,
+                    ),
+                    self.bn_func(spec_dim * 2),
+                    self.act_func(),
+                )
+            ]
+            * refine_layers
+        )
+    def forward(self, wav):
+        """
+        Map a waveform to a processed waveform.
+        :return: the waveform, or (waveform, mag, phase) when add_spec_results is set
+        """
+        # stft
+        origin_mag, origin_phase = self.log_stft(wav)
+        origin_x = torch.cat([origin_mag, origin_phase], dim=1)
+        # prev
+        x = self.prev_conv(origin_x)
+        # body
+        # down
+        down_cache = []
+        for idx, block in enumerate(self.down):
+            x = block(x)
+            down_cache.append(x)
+            x = self.down_pool(x)
+        # up
+        for idx, block in enumerate(self.up):
+            x = block(x)
+            res = F.interpolate(
+                down_cache[self.layers - (idx + 1)],
+                size=[x.size()[2]],
+                mode="linear",
+                align_corners=False,
+            )
+            x = concat_complex(x, res, dim=1)
+        # match spec dimension
+        x = self.out_conv(x)
+        if origin_mag.size(2) != x.size(2):
+            x = F.interpolate(
+                x, size=[origin_mag.size(2)], mode="linear", align_corners=False
+            )
+        # refine
+        # `mag` and `phase` are only bound inside this loop, so refine_conv must
+        # hold at least one module (refine_layers >= 1).
+        for idx, refine_module in enumerate(self.refine_conv):
+            x = refine_module(x)
+            mag, phase = x.chunk(2, 1)
+            mag, phase = self.masking(mag, phase, origin_mag, origin_phase)
+            if idx < len(self.refine_conv) - 1:
+                x = torch.cat([mag, phase], dim=1)
+        # clamp phase
+        phase = phase.clamp(-np.pi, np.pi)
+        out = self.exp_istft(mag, phase)
+        out = self.adjust_diff(out, wav)
+        if self.add_spec_results:
+            out = (out, mag, phase)
+        return out
+class _ComplexConvNd(nn.Module):
+    """
+    Base class for the complex convolutions: stores the hyper-parameters and owns
+    the two weight tensors.
+    A: real weight
+    B: imaginary weight
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.output_padding = output_padding
+        # `transposed` must be set before make_weight, which reads it.
+        self.transposed = transposed
+        self.A = self.make_weight(in_channels, out_channels, kernel_size)
+        self.B = self.make_weight(in_channels, out_channels, kernel_size)
+        self.reset_parameters()
+    def make_weight(self, in_ch, out_ch, kernel_size):
+        """
+        Allocate an uninitialised weight parameter for a convolution with groups=2.
+        The second dimension is halved because each of the two groups sees half of
+        the channels; the first two dimensions are swapped for transposed convs.
+        """
+        if self.transposed:
+            tensor = nn.Parameter(torch.Tensor(in_ch, out_ch // 2, *kernel_size))
+        else:
+            tensor = nn.Parameter(torch.Tensor(out_ch, in_ch // 2, *kernel_size))
+        return tensor
+    def reset_parameters(self):
+        """Fill A with a scaled-down uniform init and B with U(-1/pi, 1/pi)."""
+        # init real weight
+        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.A)
+        # init A
+        gain = calculate_gain("leaky_relu", 0)
+        std = gain / np.sqrt(fan_in)
+        bound = np.sqrt(3.0) * std
+        with torch.no_grad():
+            # TODO: find more stable initial values
+            # The usual uniform bound is shrunk by a factor of 1 / pi**2.
+            self.A.uniform_(-bound * (1 / (np.pi**2)), bound * (1 / (np.pi**2)))
+            #
+            # B is initialised relative to pi:
+            # the range (-pi, pi) is too large, so its reciprocal is used instead.
+            self.B.uniform_(-1 / np.pi, 1 / np.pi)
+class ComplexConv1d(_ComplexConvNd):
+    """
+    Complex Convolution 1d
+    Built from two grouped (groups=2) real convolutions, one with the real weight
+    A and one with the imaginary weight B, whose outputs are summed.
+    """
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1
+    ):
+        kernel_size = single(kernel_size)
+        stride = single(stride)
+        # padding stays a plain int: forward() applies it with F.pad (reflect)
+        # instead of handing it to F.conv1d.
+        padding = padding
+        dilation = single(dilation)
+        super(ComplexConv1d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            single(0),
+        )
+    def forward(self, x):
+        """
+        Implemented complex convolution using combining 'grouped convolution' and
+        'real / img weight'
+        :param x: data (N, C, T) C is concatenated with C/2 real channels and C/2
+            imaginary channels
+        :return: complex conved result
+        """
+        # adopt reflect padding
+        if self.padding:
+            x = F.pad(x, (self.padding, self.padding), "reflect")
+        # forward real
+        real_part = F.conv1d(
+            x,
+            self.A,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        # forward imaginary
+        spl = self.in_channels // 2
+        # Use the parameter itself rather than `.data`: `.data` detaches the tensor
+        # from autograd, so B would never receive a gradient here.
+        # ComplexTransposedConv1d builds its weight the same way.
+        weight_B = torch.cat([self.B[:spl] * (-1), self.B[spl:]])
+        idea_part = F.conv1d(
+            x,
+            weight_B,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        return real_part + idea_part
+class ComplexTransposedConv1d(_ComplexConvNd):
+    """
+    Complex Transposed Convolution 1d
+    Built from two grouped (groups=2) real transposed convolutions, one with the
+    real weight A and one with the imaginary weight B, whose outputs are summed.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        dilation=1,
+    ):
+        kernel_size = single(kernel_size)
+        stride = single(stride)
+        # padding and output_padding stay plain ints: forward() applies both with
+        # F.pad (reflect) instead of handing them to F.conv_transpose1d.
+        padding = padding
+        dilation = single(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            True,
+            output_padding,
+        )
+    def forward(self, x, output_size=None):
+        """
+        Implemented complex transposed convolution using combining 'grouped convolution'
+        and 'real / img weight'
+        :param x: data (N, C, T) C is concatenated with C/2 real channels and C/2
+            imaginary channels
+        :param output_size: accepted but not used by this implementation
+        :return: complex transposed convolution result
+        """
+        # forward real
+        # `padding` reflect-pads the INPUT before the transposed convolution.
+        if self.padding:
+            x = F.pad(x, (self.padding, self.padding), "reflect")
+        real_part = F.conv_transpose1d(
+            x,
+            self.A,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        # forward imaginary
+        spl = self.out_channels // 2
+        weight_B = torch.cat([self.B[:spl] * (-1), self.B[spl:]])
+        idea_part = F.conv_transpose1d(
+            x,
+            weight_B,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        # `output_padding` reflect-pads both outputs on both sides.
+        if self.output_padding:
+            real_part = F.pad(
+                real_part, (self.output_padding, self.output_padding), "reflect"
+            )
+            idea_part = F.pad(
+                idea_part, (self.output_padding, self.output_padding), "reflect"
+            )
+        return real_part + idea_part
+class ComplexActLayer(nn.Module):
+    """
+    Activate the 'real' half and the 'img' half of the channels differently.
+    In the DCUnet implemented in this repository the real part lives in log space
+    and gets a leaky ReLU; the phase (img) part is mapped into [-pi, pi] with
+    tanh(x) * pi.
+    :param is_out: flag passed by SpectrogramUnet when it builds its output
+        activation; it is stored but does not change forward()
+    """
+    def __init__(self, is_out: bool = False):
+        super().__init__()
+        # Accepting the keyword keeps `ComplexActLayer(is_out=True)` from raising
+        # TypeError; nn.Module.__init__ itself takes no arguments.
+        self.is_out = is_out
+    def forward(self, x):
+        """Split dim 1 in two halves, activate each and concatenate them again."""
+        real, img = x.chunk(2, 1)
+        return torch.cat([F.leaky_relu_(real), torch.tanh(img) * np.pi], dim=1)
+class STFT(nn.Module):
+    """
+    Re-construct stft for calculating backward operation
+    refer on : https://github.com/pseeth/torch-stft/blob/master/torch_stft/stft.py
+    The transform and its inverse are implemented as (transposed) 1-D convolutions
+    with fixed Fourier bases stored as buffers.
+    :param filter_length: FFT size
+    :param hop_length: hop between frames
+    :param win_length: window length, defaults to `filter_length`
+    :param window: window name passed to scipy.signal.get_window
+    """
+    def __init__(
+        self,
+        filter_length: int = 1024,
+        hop_length: int = 512,
+        win_length: int = None,
+        window: str = "hann",
+    ):
+        super().__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length if win_length else filter_length
+        self.window = window
+        self.pad_amount = self.filter_length // 2
+        # make fft window
+        assert filter_length >= self.win_length
+        # get window and zero center pad it to filter_length
+        fft_window = get_window(window, self.win_length, fftbins=True)
+        # `size` is passed by keyword: librosa >= 0.10 rejects it as a positional
+        # argument, and older releases accept the keyword as well.
+        fft_window = pad_center(fft_window, size=filter_length)
+        fft_window = torch.from_numpy(fft_window).float()
+        # calculate fourier_basis: real rows stacked on top of imaginary rows,
+        # keeping only the first filter_length / 2 + 1 frequency bins
+        cut_off = int((self.filter_length / 2 + 1))
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+        fourier_basis = np.vstack(
+            [np.real(fourier_basis[:cut_off, :]), np.imag(fourier_basis[:cut_off, :])]
+        )
+        # make forward & inverse basis
+        self.register_buffer("square_window", fft_window**2)
+        forward_basis = torch.FloatTensor(fourier_basis[:, np.newaxis, :]) * fft_window
+        inverse_basis = (
+            torch.FloatTensor(
+                np.linalg.pinv(self.filter_length / self.hop_length * fourier_basis).T[
+                    :, np.newaxis, :
+                ]
+            )
+            * fft_window
+        )
+        # torch.pinverse has a bug, so at this time, it is separated into two parts..
+        self.register_buffer("forward_basis", forward_basis)
+        self.register_buffer("inverse_basis", inverse_basis)
+    def transform(self, wav: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Return (magnitude, phase) of `wav`.
+        The phase is computed from `.data`, so no gradient flows through it.
+        """
+        # reflect padding
+        wav = wav.unsqueeze(1).unsqueeze(1)
+        wav = F.pad(
+            wav, (self.pad_amount, self.pad_amount, 0, 0), mode="reflect"
+        ).squeeze(1)
+        # conv
+        forward_trans = F.conv1d(
+            wav, self.forward_basis, stride=self.hop_length, padding=0
+        )
+        real_part, imag_part = forward_trans.chunk(2, 1)
+        return torch.sqrt(real_part**2 + imag_part**2), torch.atan2(
+            imag_part.data, real_part.data
+        )
+    def inverse(
+        self, magnitude: torch.Tensor, phase: torch.Tensor, eps: float = 1e-9
+    ) -> torch.Tensor:
+        """
+        Rebuild a waveform from magnitude and phase.
+        :param eps: added to the window-overlap sum before dividing by it
+        """
+        comp = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+        )
+        inverse_transform = F.conv_transpose1d(
+            comp, self.inverse_basis, stride=self.hop_length, padding=0
+        )
+        # remove window effect: divide by the overlap-added squared window
+        n_frames = comp.size(-1)
+        inverse_size = inverse_transform.size(-1)
+        window_filter = torch.ones(1, 1, n_frames).type_as(inverse_transform)
+        weight = self.square_window[: self.filter_length].unsqueeze(0).unsqueeze(0)
+        window_filter = F.conv_transpose1d(
+            window_filter, weight, stride=self.hop_length, padding=0
+        )
+        window_filter = window_filter.squeeze()[:inverse_size] + eps
+        inverse_transform /= window_filter
+        # scale by hop ratio
+        inverse_transform *= self.filter_length / self.hop_length
+        # drop the reflect padding added by transform()
+        return inverse_transform[..., self.pad_amount : -self.pad_amount].squeeze(1)

remfx/dptnet.py ADDED Viewed

	@@ -0,0 +1,460 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.container import ModuleList
+from torch.nn.modules.activation import MultiheadAttention
+from torch.nn.modules.dropout import Dropout
+from torch.nn.modules.linear import Linear
+from torch.nn.modules.rnn import LSTM
+from torch.nn.modules.normalization import LayerNorm
+from torch.autograd import Variable
+import copy
+import math
+# adapted from https://github.com/ujscjj/DPTNet
+class DPTNet_base(nn.Module):
+    """
+    Encoder -> normalisation -> dual-path separation -> mask -> decoder.
+    The separation module's scores are turned into non-negative masks that the
+    decoder applies to the encoded mixture.
+    """
+    def __init__(
+        self,
+        enc_dim,
+        feature_dim,
+        hidden_dim,
+        layer,
+        segment_size=250,
+        nspk=2,
+        win_len=2,
+    ):
+        super().__init__()
+        # parameters
+        self.window = win_len
+        self.stride = self.window // 2
+        self.enc_dim = enc_dim
+        self.feature_dim = feature_dim
+        self.hidden_dim = hidden_dim
+        self.segment_size = segment_size
+        self.layer = layer
+        self.num_spk = nspk
+        self.eps = 1e-8
+        self.dpt_encoder = DPTEncoder(
+            n_filters=enc_dim,
+            window_size=win_len,
+        )
+        # GroupNorm with a single group over all enc_dim channels.
+        self.enc_LN = nn.GroupNorm(1, self.enc_dim, eps=1e-8)
+        self.dpt_separation = DPTSeparation(
+            self.enc_dim,
+            self.feature_dim,
+            self.hidden_dim,
+            self.num_spk,
+            self.layer,
+            self.segment_size,
+        )
+        # 1x1 conv lifting separation features back to the encoder dimension.
+        self.mask_conv1x1 = nn.Conv1d(self.feature_dim, self.enc_dim, 1, bias=False)
+        self.decoder = DPTDecoder(n_filters=enc_dim, window_size=win_len)
+    def forward(self, batch):
+        """
+        batch: a (mix, target) pair; only `mix` is used here.
+        mix: shape (batch, T)
+        Returns the decoder output for the masked encoding.
+        """
+        mix, target = batch
+        batch_size = mix.shape[0]
+        mix = self.dpt_encoder(mix)  # (B, E, L)
+        score_ = self.enc_LN(mix)  # B, E, L
+        score_ = self.dpt_separation(score_)  # B, nspk, T, N
+        score_ = (
+            score_.view(batch_size * self.num_spk, -1, self.feature_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )  # B*nspk, N, T
+        score = self.mask_conv1x1(score_)  # [B*nspk, N, L] -> [B*nspk, E, L]
+        score = score.view(
+            batch_size, self.num_spk, self.enc_dim, -1
+        )  # [B*nspk, E, L] -> [B, nspk, E, L]
+        # ReLU keeps the mask non-negative.
+        est_mask = F.relu(score)
+        est_source = self.decoder(
+            mix, est_mask
+        )  # [B, E, L] + [B, nspk, E, L]--> [B, nspk, T]
+        return est_source
+class DPTEncoder(nn.Module):
+    """Waveform encoder: one bias-free Conv1d (stride = window_size // 2) plus ReLU."""
+    def __init__(self, n_filters: int = 64, window_size: int = 2):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            1, n_filters, kernel_size=window_size, stride=window_size // 2, bias=False
+        )
+    def forward(self, x):
+        """Insert a channel axis at dim 1, then apply the convolution and ReLU."""
+        x = x.unsqueeze(1)
+        x = F.relu(self.conv(x))
+        return x
+class TransformerEncoderLayer(torch.nn.Module):
+    """
+    Transformer encoder layer whose feed-forward part is a bidirectional LSTM
+    followed by a linear projection back to d_model.
+    `dim_feedforward` is accepted but not used by this implementation.
+    """
+    def __init__(
+        self, d_model, nhead, hidden_size, dim_feedforward, dropout, activation="relu"
+    ):
+        super(TransformerEncoderLayer, self).__init__()
+        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of improved part
+        self.lstm = LSTM(d_model, hidden_size, 1, bidirectional=True)
+        self.dropout = Dropout(dropout)
+        # The bidirectional LSTM outputs hidden_size * 2 features.
+        self.linear = Linear(hidden_size * 2, d_model)
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.dropout1 = Dropout(dropout)
+        self.dropout2 = Dropout(dropout)
+        self.activation = _get_activation_fn(activation)
+    def __setstate__(self, state):
+        # Fall back to ReLU when restoring a state that has no "activation" entry.
+        if "activation" not in state:
+            state["activation"] = F.relu
+        super(TransformerEncoderLayer, self).__setstate__(state)
+    def forward(self, src, src_mask=None, src_key_padding_mask=None):
+        r"""Pass the input through the encoder layer.
+        Args:
+            src: the sequence to the encoder layer (required).
+            src_mask: the mask for the src sequence (optional).
+            src_key_padding_mask: the mask for the src keys per batch (optional).
+        Shape:
+            see the docs in Transformer class.
+        """
+        # Self-attention sub-layer: residual connection followed by layer norm.
+        src2 = self.self_attn(
+            src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        # LSTM sub-layer: lstm -> activation -> dropout -> linear, again with a
+        # residual connection and layer norm.
+        src2 = self.linear(self.dropout(self.activation(self.lstm(src)[0])))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
+def _get_clones(module, N):
+    return ModuleList([copy.deepcopy(module) for i in range(N)])
+def _get_activation_fn(activation):
+    if activation == "relu":
+        return F.relu
+    elif activation == "gelu":
+        return F.gelu
+    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
+class SingleTransformer(nn.Module):
+    """
+    Container module for a single Transformer layer.
+    args: input_size: int, dimension of the input feature.
+    The input should have shape (batch, seq_len, input_size).
+    """
+    def __init__(self, input_size, hidden_size, dropout):
+        super(SingleTransformer, self).__init__()
+        self.transformer = TransformerEncoderLayer(
+            d_model=input_size,
+            nhead=4,
+            hidden_size=hidden_size,
+            dim_feedforward=hidden_size * 2,
+            dropout=dropout,
+        )
+    def forward(self, input):
+        # input shape: batch, seq, dim
+        output = input
+        transformer_output = (
+            self.transformer(output.permute(1, 0, 2).contiguous())
+            .permute(1, 0, 2)
+            .contiguous()
+        )
+        return transformer_output
+# dual-path transformer
+class DPT(nn.Module):
+    """
+    Deep dual-path transformer.
+    args:
+        input_size: int, dimension of the input feature. The input should have shape
+                    (batch, seq_len, input_size).
+        hidden_size: int, dimension of the hidden state.
+        output_size: int, dimension of the output size.
+        num_layers: int, number of stacked Transformer layers. Default is 1.
+        dropout: float, dropout ratio. Default is 0.
+    """
+    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0):
+        super(DPT, self).__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.hidden_size = hidden_size
+        # dual-path transformer
+        self.row_transformer = nn.ModuleList([])
+        self.col_transformer = nn.ModuleList([])
+        for i in range(num_layers):
+            self.row_transformer.append(
+                SingleTransformer(input_size, hidden_size, dropout)
+            )
+            self.col_transformer.append(
+                SingleTransformer(input_size, hidden_size, dropout)
+            )
+        # output layer
+        self.output = nn.Sequential(nn.PReLU(), nn.Conv2d(input_size, output_size, 1))
+    def forward(self, input):
+        # input shape: batch, N, dim1, dim2
+        # apply transformer on dim1 first and then dim2
+        # output shape: B, output_size, dim1, dim2
+        # input = input.to(device)
+        batch_size, _, dim1, dim2 = input.shape
+        output = input
+        for i in range(len(self.row_transformer)):
+            row_input = (
+                output.permute(0, 3, 2, 1)
+                .contiguous()
+                .view(batch_size * dim2, dim1, -1)
+            )  # B*dim2, dim1, N
+            row_output = self.row_transformer[i](row_input)  # B*dim2, dim1, H
+            row_output = (
+                row_output.view(batch_size, dim2, dim1, -1)
+                .permute(0, 3, 2, 1)
+                .contiguous()
+            )  # B, N, dim1, dim2
+            output = row_output
+            col_input = (
+                output.permute(0, 2, 3, 1)
+                .contiguous()
+                .view(batch_size * dim1, dim2, -1)
+            )  # B*dim1, dim2, N
+            col_output = self.col_transformer[i](col_input)  # B*dim1, dim2, H
+            col_output = (
+                col_output.view(batch_size, dim1, dim2, -1)
+                .permute(0, 3, 1, 2)
+                .contiguous()
+            )  # B, N, dim1, dim2
+            output = col_output
+        output = self.output(output)  # B, output_size, dim1, dim2
+        return output
+# base module for deep DPT
+class DPT_base(nn.Module):
+    def __init__(
+        self, input_dim, feature_dim, hidden_dim, num_spk=2, layer=6, segment_size=250
+    ):
+        super(DPT_base, self).__init__()
+        self.input_dim = input_dim
+        self.feature_dim = feature_dim
+        self.hidden_dim = hidden_dim
+        self.layer = layer
+        self.segment_size = segment_size
+        self.num_spk = num_spk
+        self.eps = 1e-8
+        # bottleneck
+        self.BN = nn.Conv1d(self.input_dim, self.feature_dim, 1, bias=False)
+        # DPT model
+        self.DPT = DPT(
+            self.feature_dim,
+            self.hidden_dim,
+            self.feature_dim * self.num_spk,
+            num_layers=layer,
+        )
+    def pad_segment(self, input, segment_size):
+        # input is the features: (B, N, T)
+        batch_size, dim, seq_len = input.shape
+        segment_stride = segment_size // 2
+        rest = segment_size - (segment_stride + seq_len % segment_size) % segment_size
+        if rest > 0:
+            pad = Variable(torch.zeros(batch_size, dim, rest)).type(input.type())
+            input = torch.cat([input, pad], 2)
+        pad_aux = Variable(torch.zeros(batch_size, dim, segment_stride)).type(
+            input.type()
+        )
+        input = torch.cat([pad_aux, input, pad_aux], 2)
+        return input, rest
+    def split_feature(self, input, segment_size):
+        # split the feature into chunks of segment size
+        # input is the features: (B, N, T)
+        input, rest = self.pad_segment(input, segment_size)
+        batch_size, dim, seq_len = input.shape
+        segment_stride = segment_size // 2
+        segments1 = (
+            input[:, :, :-segment_stride]
+            .contiguous()
+            .view(batch_size, dim, -1, segment_size)
+        )
+        segments2 = (
+            input[:, :, segment_stride:]
+            .contiguous()
+            .view(batch_size, dim, -1, segment_size)
+        )
+        segments = (
+            torch.cat([segments1, segments2], 3)
+            .view(batch_size, dim, -1, segment_size)
+            .transpose(2, 3)
+        )
+        return segments.contiguous(), rest
+    def merge_feature(self, input, rest):
+        # merge the splitted features into full utterance
+        # input is the features: (B, N, L, K)
+        batch_size, dim, segment_size, _ = input.shape
+        segment_stride = segment_size // 2
+        input = (
+            input.transpose(2, 3)
+            .contiguous()
+            .view(batch_size, dim, -1, segment_size * 2)
+        )  # B, N, K, L
+        input1 = (
+            input[:, :, :, :segment_size]
+            .contiguous()
+            .view(batch_size, dim, -1)[:, :, segment_stride:]
+        )
+        input2 = (
+            input[:, :, :, segment_size:]
+            .contiguous()
+            .view(batch_size, dim, -1)[:, :, :-segment_stride]
+        )
+        output = input1 + input2
+        if rest > 0:
+            output = output[:, :, :-rest]
+        return output.contiguous()  # B, N, T
+    def forward(self, input):
+        pass
+class DPTSeparation(DPT_base):
+    def __init__(self, *args, **kwargs):
+        super(DPTSeparation, self).__init__(*args, **kwargs)
+        # gated output layer
+        self.output = nn.Sequential(
+            nn.Conv1d(self.feature_dim, self.feature_dim, 1), nn.Tanh()
+        )
+        self.output_gate = nn.Sequential(
+            nn.Conv1d(self.feature_dim, self.feature_dim, 1), nn.Sigmoid()
+        )
+    def forward(self, input):
+        # input = input.to(device)
+        # input: (B, E, T)
+        batch_size, E, seq_length = input.shape
+        enc_feature = self.BN(input)  # (B, E, L)-->(B, N, L)
+        # split the encoder output into overlapped, longer segments
+        enc_segments, enc_rest = self.split_feature(
+            enc_feature, self.segment_size
+        )  # B, N, L, K: L is the segment_size
+        # print('enc_segments.shape {}'.format(enc_segments.shape))
+        # pass to DPT
+        output = self.DPT(enc_segments).view(
+            batch_size * self.num_spk, self.feature_dim, self.segment_size, -1
+        )  # B*nspk, N, L, K
+        # overlap-and-add of the outputs
+        output = self.merge_feature(output, enc_rest)  # B*nspk, N, T
+        # gated output layer for filter generation
+        bf_filter = self.output(output) * self.output_gate(output)  # B*nspk, K, T
+        bf_filter = (
+            bf_filter.transpose(1, 2)
+            .contiguous()
+            .view(batch_size, self.num_spk, -1, self.feature_dim)
+        )  # B, nspk, T, N
+        return bf_filter
+class DPTDecoder(nn.Module):
+    def __init__(self, n_filters: int = 64, window_size: int = 2):
+        super().__init__()
+        self.W = window_size
+        self.basis_signals = nn.Linear(n_filters, window_size, bias=False)
+    def forward(self, mixture, mask):
+        """
+        mixture: (batch, n_filters, L)
+        mask: (batch, sources, n_filters, L)
+        """
+        source_w = torch.unsqueeze(mixture, 1) * mask  # [B, C, E, L]
+        source_w = torch.transpose(source_w, 2, 3)  # [B, C, L, E]
+        # S = DV
+        est_source = self.basis_signals(source_w)  # [B, C, L, W]
+        est_source = overlap_and_add(est_source, self.W // 2)  # B x C x T
+        return est_source
+def overlap_and_add(signal, frame_step):
+    """Reconstructs a signal from a framed representation.
+    Adds potentially overlapping frames of a signal with shape
+    `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
+    The resulting tensor has shape `[..., output_size]` where
+        output_size = (frames - 1) * frame_step + frame_length
+    Args:
+        signal: A [..., frames, frame_length] Tensor.
+        All dimensions may be unknown, and rank must be at least 2.
+        frame_step: An integer denoting overlap offsets. Must be less than or equal to frame_length.
+    Returns:
+        A Tensor with shape [..., output_size] containing the overlap-added frames of signal's
+        inner-most two dimensions.
+        output_size = (frames - 1) * frame_step + frame_length
+    Based on https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
+    """
+    outer_dimensions = signal.size()[:-2]
+    frames, frame_length = signal.size()[-2:]
+    subframe_length = math.gcd(frame_length, frame_step)  # gcd=Greatest Common Divisor
+    subframe_step = frame_step // subframe_length
+    subframes_per_frame = frame_length // subframe_length
+    output_size = frame_step * (frames - 1) + frame_length
+    output_subframes = output_size // subframe_length
+    subframe_signal = signal.reshape(*outer_dimensions, -1, subframe_length)
+    frame = torch.arange(0, output_subframes).unfold(
+        0, subframes_per_frame, subframe_step
+    )
+    frame = signal.new_tensor(frame).long()  # signal may in GPU or CPU
+    frame = frame.contiguous().view(-1)
+    result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
+    result.index_add_(-2, frame, subframe_signal)
+    result = result.view(*outer_dimensions, -1)
+    return result

remfx/models.py CHANGED Viewed

@@ -1,10 +1,6 @@
-import wandb
 import torch
-import torchaudio
 import torchmetrics
 import pytorch_lightning as pl
-import torch.nn.functional as F
 from torch import Tensor, nn
 from einops import rearrange
 from torchaudio.models import HDemucs
@@ -13,10 +9,12 @@ from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
 from umx.openunmix.model import OpenUnmix, Separator
-from remfx.utils import FADLoss
-class RemFXModel(pl.LightningModule):
     def __init__(
         self,
         lr: float,
@@ -35,7 +33,7 @@ class RemFXModel(pl.LightningModule):
         self.lr_weight_decay = lr_weight_decay
         self.sample_rate = sample_rate
         self.model = network
-        self.metrics = torch.nn.ModuleDict(
             {
                 "SISDR": SISDRLoss(),
                 "STFT": MultiResolutionSTFTLoss(),
@@ -94,7 +92,8 @@ class RemFXModel(pl.LightningModule):
         return loss
     def common_step(self, batch, batch_idx, mode: str = "train"):
-        x, y, _, _ = batch
         loss, output = self.model((x, y))
         self.log(f"{mode}_loss", loss)
         # Metric logging
@@ -201,7 +200,7 @@ class RemFXModel(pl.LightningModule):
         )
-class OpenUnmixModel(torch.nn.Module):
     def __init__(
         self,
         n_fft: int = 2048,
@@ -234,7 +233,7 @@ class OpenUnmixModel(torch.nn.Module):
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=self.sample_rate
         )
-        self.l1loss = torch.nn.L1Loss()
     def forward(self, batch):
         x, target = batch
@@ -249,7 +248,7 @@ class OpenUnmixModel(torch.nn.Module):
         return self.separator(x).squeeze(1)
-class DemucsModel(torch.nn.Module):
     def __init__(self, sample_rate, **kwargs) -> None:
         super().__init__()
         self.model = HDemucs(**kwargs)
@@ -257,7 +256,7 @@ class DemucsModel(torch.nn.Module):
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
-        self.l1loss = torch.nn.L1Loss()
     def forward(self, batch):
         x, target = batch
@@ -284,201 +283,42 @@ class DiffusionGenerationModel(nn.Module):
         return self.model.sample(noise, num_steps=num_steps)
-def log_wandb_audio_batch(
-    logger: pl.loggers.WandbLogger,
-    id: str,
-    samples: Tensor,
-    sampling_rate: int,
-    caption: str = "",
-    max_items: int = 10,
-):
-    num_items = samples.shape[0]
-    samples = rearrange(samples, "b c t -> b t c")
-    for idx in range(num_items):
-        if idx >= max_items:
-            break
-        logger.experiment.log(
-            {
-                f"{id}_{idx}": wandb.Audio(
-                    samples[idx].cpu().numpy(),
-                    caption=caption,
-                    sample_rate=sampling_rate,
-                )
-            }
-        )
-def spectrogram(
-    x: torch.Tensor,
-    window: torch.Tensor,
-    n_fft: int,
-    hop_length: int,
-    alpha: float,
-) -> torch.Tensor:
-    bs, chs, samp = x.size()
-    x = x.view(bs * chs, -1)  # move channels onto batch dim
-    X = torch.stft(
-        x,
-        n_fft=n_fft,
-        hop_length=hop_length,
-        window=window,
-        return_complex=True,
-    )
-    # move channels back
-    X = X.view(bs, chs, X.shape[-2], X.shape[-1])
-    return torch.pow(X.abs() + 1e-8, alpha)
-# adapted from https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
-def init_layer(layer):
-    """Initialize a Linear or Convolutional layer."""
-    nn.init.xavier_uniform_(layer.weight)
-    if hasattr(layer, "bias"):
-        if layer.bias is not None:
-            layer.bias.data.fill_(0.0)
-def init_bn(bn):
-    """Initialize a Batchnorm layer."""
-    bn.bias.data.fill_(0.0)
-    bn.weight.data.fill_(1.0)
-class ConvBlock(nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super(ConvBlock, self).__init__()
-        self.conv1 = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=(3, 3),
-            stride=(1, 1),
-            padding=(1, 1),
-            bias=False,
-        )
-        self.conv2 = nn.Conv2d(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=(3, 3),
-            stride=(1, 1),
-            padding=(1, 1),
-            bias=False,
         )
-        self.bn1 = nn.BatchNorm2d(out_channels)
-        self.bn2 = nn.BatchNorm2d(out_channels)
-        self.init_weight()
-    def init_weight(self):
-        init_layer(self.conv1)
-        init_layer(self.conv2)
-        init_bn(self.bn1)
-        init_bn(self.bn2)
-    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
-        x = input
-        x = F.relu_(self.bn1(self.conv1(x)))
-        x = F.relu_(self.bn2(self.conv2(x)))
-        if pool_type == "max":
-            x = F.max_pool2d(x, kernel_size=pool_size)
-        elif pool_type == "avg":
-            x = F.avg_pool2d(x, kernel_size=pool_size)
-        elif pool_type == "avg+max":
-            x1 = F.avg_pool2d(x, kernel_size=pool_size)
-            x2 = F.max_pool2d(x, kernel_size=pool_size)
-            x = x1 + x2
-        else:
-            raise Exception("Incorrect argument!")
-        return x
-class Cnn14(nn.Module):
-    def __init__(
-        self,
-        num_classes: int,
-        sample_rate: float,
-        n_fft: int = 2048,
-        hop_length: int = 512,
-        n_mels: int = 128,
-    ):
         super().__init__()
-        self.num_classes = num_classes
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        window = torch.hann_window(n_fft)
-        self.register_buffer("window", window)
-        self.melspec = torchaudio.transforms.MelSpectrogram(
-            sample_rate,
-            n_fft,
-            hop_length=hop_length,
-            n_mels=n_mels,
         )
-        self.bn0 = nn.BatchNorm2d(n_mels)
-        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
-        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
-        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
-        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
-        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
-        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
-        self.fc1 = nn.Linear(2048, 2048, bias=True)
-        self.fc_audioset = nn.Linear(2048, num_classes, bias=True)
-        self.init_weight()
-    def init_weight(self):
-        init_bn(self.bn0)
-        init_layer(self.fc1)
-        init_layer(self.fc_audioset)
-    def forward(self, x: torch.Tensor):
-        """
-        Input: (batch_size, data_length)"""
-        x = self.melspec(x)
-        x = x.permute(0, 2, 1, 3)
-        x = self.bn0(x)
-        x = x.permute(0, 2, 1, 3)
-        if self.training:
-            pass
-            # x = self.spec_augmenter(x)
-        x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block3(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block4(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block5(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = torch.mean(x, dim=3)
-        (x1, _) = torch.max(x, dim=2)
-        x2 = torch.mean(x, dim=2)
-        x = x1 + x2
-        x = F.dropout(x, p=0.5, training=self.training)
-        x = F.relu_(self.fc1(x))
-        clipwise_output = self.fc_audioset(x)
-        return clipwise_output
 class FXClassifier(pl.LightningModule):
@@ -501,7 +341,7 @@ class FXClassifier(pl.LightningModule):
     def common_step(self, batch, batch_idx, mode: str = "train"):
         x, y, dry_label, wet_label = batch
         pred_label = self.network(x)
-        loss = torch.nn.functional.cross_entropy(pred_label, dry_label)
         self.log(
             f"{mode}_loss",
             loss,

 import torch
 import torchmetrics
 import pytorch_lightning as pl
 from torch import Tensor, nn
 from einops import rearrange
 from torchaudio.models import HDemucs
 from auraloss.freq import MultiResolutionSTFTLoss
 from umx.openunmix.model import OpenUnmix, Separator
+from utils import FADLoss, spectrogram, log_wandb_audio_batch
+from dptnet import DPTNet_base
+from dcunet import RefineSpectrogramUnet
+class RemFX(pl.LightningModule):
     def __init__(
         self,
         lr: float,
         self.lr_weight_decay = lr_weight_decay
         self.sample_rate = sample_rate
         self.model = network
+        self.metrics = nn.ModuleDict(
             {
                 "SISDR": SISDRLoss(),
                 "STFT": MultiResolutionSTFTLoss(),
         return loss
     def common_step(self, batch, batch_idx, mode: str = "train"):
+        x, y, _, _ = batch  # x, y = (B, C, T), (B, C, T)
         loss, output = self.model((x, y))
         self.log(f"{mode}_loss", loss)
         # Metric logging
         )
+class OpenUnmixModel(nn.Module):
     def __init__(
         self,
         n_fft: int = 2048,
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=self.sample_rate
         )
+        self.l1loss = nn.L1Loss()
     def forward(self, batch):
         x, target = batch
         return self.separator(x).squeeze(1)
+class DemucsModel(nn.Module):
     def __init__(self, sample_rate, **kwargs) -> None:
         super().__init__()
         self.model = HDemucs(**kwargs)
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
+        self.l1loss = nn.L1Loss()
     def forward(self, batch):
         x, target = batch
         return self.model.sample(noise, num_steps=num_steps)
+class DPTNetModel(nn.Module):
+    def __init__(self, sample_rate, **kwargs):
+        super().__init__()
+        self.model = DPTNet_base(**kwargs)
+        self.mrstftloss = MultiResolutionSTFTLoss(
+            n_bins=self.num_bins, sample_rate=sample_rate
         )
+        self.l1loss = nn.L1Loss()
+    def forward(self, batch):
+        x, target = batch
+        output = self.model(x).squeeze(1)
+        loss = self.mrstftloss(output, target) + self.l1loss(output, target) * 100
+        return loss, output
+    def sample(self, x: Tensor) -> Tensor:
+        return self.model.sample(x)
+class DCUNetModel(nn.Module):
+    def __init__(self, sample_rate, **kwargs):
         super().__init__()
+        self.model = RefineSpectrogramUnet(**kwargs)
+        self.mrstftloss = MultiResolutionSTFTLoss(
+            n_bins=self.num_bins, sample_rate=sample_rate
         )
+        self.l1loss = nn.L1Loss()
+    def forward(self, batch):
+        x, target = batch
+        output = self.model(x).squeeze(1)
+        loss = self.mrstftloss(output, target) + self.l1loss(output, target) * 100
+        return loss, output
+    def sample(self, x: Tensor) -> Tensor:
+        return self.model.sample(x)
 class FXClassifier(pl.LightningModule):
     def common_step(self, batch, batch_idx, mode: str = "train"):
         x, y, dry_label, wet_label = batch
         pred_label = self.network(x)
+        loss = nn.functional.cross_entropy(pred_label, dry_label)
         self.log(
             f"{mode}_loss",
             loss,

remfx/utils.py CHANGED Viewed

@@ -7,6 +7,10 @@ from frechet_audio_distance import FrechetAudioDistance
 import numpy as np
 import torch
 import torchaudio
 def get_logger(name=__name__) -> logging.Logger:
@@ -138,3 +142,91 @@ def create_sequential_chunks(
             break
         chunks.append(audio[:, start : start + chunk_size])
     return chunks, sr

 import numpy as np
 import torch
 import torchaudio
+from torch import Tensor, nn
+import wandb
+from einops import rearrange
+from torch._six import container_abcs
 def get_logger(name=__name__) -> logging.Logger:
             break
         chunks.append(audio[:, start : start + chunk_size])
     return chunks, sr
+def log_wandb_audio_batch(
+    logger: pl.loggers.WandbLogger,
+    id: str,
+    samples: Tensor,
+    sampling_rate: int,
+    caption: str = "",
+    max_items: int = 10,
+):
+    num_items = samples.shape[0]
+    samples = rearrange(samples, "b c t -> b t c")
+    for idx in range(num_items):
+        if idx >= max_items:
+            break
+        logger.experiment.log(
+            {
+                f"{id}_{idx}": wandb.Audio(
+                    samples[idx].cpu().numpy(),
+                    caption=caption,
+                    sample_rate=sampling_rate,
+                )
+            }
+        )
+def spectrogram(
+    x: torch.Tensor,
+    window: torch.Tensor,
+    n_fft: int,
+    hop_length: int,
+    alpha: float,
+) -> torch.Tensor:
+    bs, chs, samp = x.size()
+    x = x.view(bs * chs, -1)  # move channels onto batch dim
+    X = torch.stft(
+        x,
+        n_fft=n_fft,
+        hop_length=hop_length,
+        window=window,
+        return_complex=True,
+    )
+    # move channels back
+    X = X.view(bs, chs, X.shape[-2], X.shape[-1])
+    return torch.pow(X.abs() + 1e-8, alpha)
+def init_layer(layer):
+    """Initialize a Linear or Convolutional layer."""
+    nn.init.xavier_uniform_(layer.weight)
+    if hasattr(layer, "bias"):
+        if layer.bias is not None:
+            layer.bias.data.fill_(0.0)
+def init_bn(bn):
+    """Initialize a Batchnorm layer."""
+    bn.bias.data.fill_(0.0)
+    bn.weight.data.fill_(1.0)
+def _ntuple(n: int):
+    def parse(x):
+        if isinstance(x, container_abcs.Iterable):
+            return x
+        return tuple([x] * n)
+    return parse
+single = _ntuple(1)
+def concat_complex(a: torch.tensor, b: torch.tensor, dim: int = 1) -> torch.tensor:
+    """
+    Concatenate two complex tensors in same dimension concept
+    :param a: complex tensor
+    :param b: another complex tensor
+    :param dim: target dimension
+    :return: concatenated tensor
+    """
+    a_real, a_img = a.chunk(2, dim)
+    b_real, b_img = b.chunk(2, dim)
+    return torch.cat([a_real, b_real, a_img, b_img], dim=dim)

scripts/test.py CHANGED Viewed

@@ -3,7 +3,6 @@ import hydra
 from omegaconf import DictConfig
 import remfx.utils as utils
 from pytorch_lightning.utilities.model_summary import ModelSummary
-from remfx.models import RemFXModel
 import torch
 log = utils.get_logger(__name__)

 from omegaconf import DictConfig
 import remfx.utils as utils
 from pytorch_lightning.utilities.model_summary import ModelSummary
 import torch
 log = utils.get_logger(__name__)