Commit 7ac8557
Christian J. Steinmetz committed
Parent(s): a3e84f7

adding multi-label classification task with CNN

Files changed:
- cfg/model/classifier.yaml +14 -0
- cfg/model/umx.yaml +0 -2
- remfx/models.py +245 -5
cfg/model/classifier.yaml    ADDED

@@ -0,0 +1,14 @@
+# @package _global_
+model:
+  _target_: remfx.models.FXClassifier
+  lr: 1e-4
+  lr_weight_decay: 1e-3
+  sample_rate: ${sample_rate}
+  network:
+    _target_: remfx.models.Cnn14
+    num_classes: ${num_classes}
+    n_fft: 4096
+    hop_length: 512
+    n_mels: 128
+    sample_rate: ${sample_rate}
+
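For orientation (a sketch, not part of the commit): the nested `_target_` keys are Hydra instantiation points, so `hydra.utils.instantiate` builds the inner `Cnn14` first and passes it to `FXClassifier` as its `network` argument. The concrete values below are stand-ins for the `${sample_rate}` and `${num_classes}` interpolations, which resolve from the global config at run time.

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # stand-in values; the real config resolves ${sample_rate} / ${num_classes}
    cfg = OmegaConf.create(
        {
            "model": {
                "_target_": "remfx.models.FXClassifier",
                "lr": 1e-4,
                "lr_weight_decay": 1e-3,
                "sample_rate": 48000,
                "network": {
                    "_target_": "remfx.models.Cnn14",
                    "num_classes": 5,
                    "n_fft": 4096,
                    "hop_length": 512,
                    "n_mels": 128,
                    "sample_rate": 48000,
                },
            }
        }
    )

    # Hydra instantiates nested _target_ configs recursively by default,
    # so this yields FXClassifier(network=Cnn14(...), lr=1e-4, ...)
    model = instantiate(cfg.model)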
cfg/model/umx.yaml    CHANGED

@@ -11,7 +11,5 @@ model:
   _target_: remfx.models.OpenUnmixModel
   n_fft: 2048
   hop_length: 512
-  n_channels: 1
-  alpha: 0.3
   sample_rate: ${sample_rate}
 
remfx/models.py    CHANGED

@@ -1,15 +1,19 @@
+import wandb
 import torch
-
+import torchaudio
+import torchmetrics
 import pytorch_lightning as pl
+import torch.nn.functional as F
+
+from torch import Tensor, nn
 from einops import rearrange
-import
+from torchaudio.models import HDemucs
 from audio_diffusion_pytorch import DiffusionModel
 from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
-from remfx.utils import FADLoss
-
 from umx.openunmix.model import OpenUnmix, Separator
-
+
+from remfx.utils import FADLoss
 
 
 class RemFXModel(pl.LightningModule):

@@ -326,3 +330,239 @@ def spectrogram(
     X = X.view(bs, chs, X.shape[-2], X.shape[-1])
 
     return torch.pow(X.abs() + 1e-8, alpha)
+
+
+# adapted from https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
+
+
+def init_layer(layer):
+    """Initialize a Linear or Convolutional layer."""
+    nn.init.xavier_uniform_(layer.weight)
+
+    if hasattr(layer, "bias"):
+        if layer.bias is not None:
+            layer.bias.data.fill_(0.0)
+
+
+def init_bn(bn):
+    """Initialize a Batchnorm layer."""
+    bn.bias.data.fill_(0.0)
+    bn.weight.data.fill_(1.0)
+
+
+class ConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+
+        self.conv2 = nn.Conv2d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+
+        self.init_weight()
+
+    def init_weight(self):
+        init_layer(self.conv1)
+        init_layer(self.conv2)
+        init_bn(self.bn1)
+        init_bn(self.bn2)
+
+    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        x = F.relu_(self.bn2(self.conv2(x)))
+        if pool_type == "max":
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg":
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg+max":
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception("Incorrect argument!")
+
+        return x
+
+
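A shape check for the block above (a sketch, not part of the commit; the batch and spatial sizes are arbitrary): each ConvBlock runs two 3x3 conv + BN + in-place ReLU stages and then pools, so `pool_size=(2, 2)` halves both the mel and time axes while the channel count changes to `out_channels`.

    import torch

    block = ConvBlock(in_channels=1, out_channels=64)
    x = torch.randn(8, 1, 128, 100)  # (batch, channels, n_mels, frames)
    y = block(x, pool_size=(2, 2), pool_type="avg")
    print(y.shape)  # torch.Size([8, 64, 64, 50]): channels up, spatial dims halved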
+class Cnn14(nn.Module):
+    def __init__(
+        self,
+        num_classes: int,
+        sample_rate: float,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        n_mels: int = 128,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+
+        window = torch.hann_window(n_fft)
+        self.register_buffer("window", window)
+
+        self.melspec = torchaudio.transforms.MelSpectrogram(
+            sample_rate,
+            n_fft,
+            hop_length=hop_length,
+            n_mels=n_mels,
+        )
+
+        self.bn0 = nn.BatchNorm2d(n_mels)
+
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+        self.fc1 = nn.Linear(2048, 2048, bias=True)
+        self.fc_audioset = nn.Linear(2048, num_classes, bias=True)
+
+        self.init_weight()
+
+    def init_weight(self):
+        init_bn(self.bn0)
+        init_layer(self.fc1)
+        init_layer(self.fc_audioset)
+
+    def forward(self, x: torch.Tensor):
+        """
+        Input: (batch_size, data_length)"""
+
+        x = self.melspec(x)
+        x = x.permute(0, 2, 1, 3)
+        x = self.bn0(x)
+        x = x.permute(0, 2, 1, 3)
+
+        if self.training:
+            pass
+            # x = self.spec_augmenter(x)
+
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block5(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = torch.mean(x, dim=3)
+
+        (x1, _) = torch.max(x, dim=2)
+        x2 = torch.mean(x, dim=2)
+        x = x1 + x2
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu_(self.fc1(x))
+        clipwise_output = self.fc_audioset(x)
+
+        return clipwise_output
+
+
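A usage sketch for Cnn14 (not part of the commit; the 5-class count and 48 kHz rate are stand-ins for ${num_classes} and ${sample_rate}). The melspec -> permute -> BatchNorm2d(n_mels) chain implies a 4-D path, so a mono (batch, 1, samples) input appears to be what the layers expect, despite the "(batch_size, data_length)" docstring.

    import torch

    net = Cnn14(num_classes=5, sample_rate=48000, n_fft=4096, hop_length=512, n_mels=128)
    net.eval()

    audio = torch.randn(2, 1, 48000)  # two 1-second mono clips
    with torch.no_grad():
        logits = net(audio)
    print(logits.shape)  # torch.Size([2, 5]): raw per-class logits, no sigmoid/softmax applied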
+def spectrogram(
+    x: torch.Tensor,
+    window: torch.Tensor,
+    n_fft: int,
+    hop_length: int,
+    alpha: float,
+) -> torch.Tensor:
+    bs, chs, samp = x.size()
+    x = x.view(bs * chs, -1)  # move channels onto batch dim
+
+    X = torch.stft(
+        x,
+        n_fft=n_fft,
+        hop_length=hop_length,
+        window=window,
+        return_complex=True,
+    )
+
+    # move channels back
+    X = X.view(bs, chs, X.shape[-2], X.shape[-1])
+
+    return torch.pow(X.abs() + 1e-8, alpha)
+
+
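A shape sketch for the helper above (not part of the commit; sizes and the alpha value are arbitrary): it folds channels into the batch, takes a complex STFT, and returns the magnitude raised to `alpha` with channels restored.

    import torch

    x = torch.randn(4, 1, 48000)  # (batch, channels, samples)
    window = torch.hann_window(4096)
    X = spectrogram(x, window, n_fft=4096, hop_length=512, alpha=0.3)
    print(X.shape)  # torch.Size([4, 1, 2049, 94]): |STFT| ** alpha per channel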
+class FXClassifier(pl.LightningModule):
+    def __init__(
+        self,
+        lr: float,
+        lr_weight_decay: float,
+        sample_rate: float,
+        network: nn.Module,
+    ):
+        super().__init__()
+        self.lr = lr
+        self.lr_weight_decay = lr_weight_decay
+        self.sample_rate = sample_rate
+        self.network = network
+
+    def forward(self, x: torch.Tensor):
+        return self.network(x)
+
+    def common_step(self, batch, batch_idx, mode: str = "train"):
+        x, y, dry_label, wet_label = batch
+        pred_label = self.network(x)
+        loss = torch.nn.functional.cross_entropy(pred_label, dry_label)
+        self.log(
+            f"{mode}_loss",
+            loss,
+            on_step=True,
+            on_epoch=True,
+            prog_bar=True,
+            logger=True,
+            sync_dist=True,
+        )
+
+        self.log(
+            f"{mode}_mAP",
+            torchmetrics.functional.retrieval_average_precision(
+                pred_label, dry_label.long()
+            ),
+            on_step=True,
+            on_epoch=True,
+            prog_bar=True,
+            logger=True,
+            sync_dist=True,
+        )
+
+        return loss
+
+    def training_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="train")
+
+    def validation_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="valid")
+
+    def test_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="test")
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.AdamW(
+            self.network.parameters(),
+            lr=self.lr,
+            weight_decay=self.lr_weight_decay,
+        )
+        return optimizer
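Finally, a wiring sketch (not part of the commit): `common_step` unpacks each batch as `(x, y, dry_label, wet_label)` and trains against `dry_label`, so any DataLoader yielding that 4-tuple fits. The hyperparameter values below are stand-ins mirroring cfg/model/classifier.yaml.

    import pytorch_lightning as pl

    model = FXClassifier(
        lr=1e-4,
        lr_weight_decay=1e-3,
        sample_rate=48000,  # stand-in for ${sample_rate}
        network=Cnn14(num_classes=5, sample_rate=48000, n_fft=4096, hop_length=512, n_mels=128),
    )

    trainer = pl.Trainer(max_epochs=1)
    # trainer.fit(model, train_loader, val_loader)  # loaders must yield (x, y, dry_label, wet_label)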