Christian J. Steinmetz committed on
Commit
3c4fcfb
β€’
1 Parent(s): 4d2eb76

updating classifier configs and adding in kwargs to pretrained models

Browse files
cfg/model/cls_panns_16k.yaml CHANGED
@@ -11,5 +11,5 @@ model:
11
  hop_length: 512
12
  n_mels: 128
13
  sample_rate: ${sample_rate}
14
- model_sample_rate: ${sample_rate}
15
 
 
11
  hop_length: 512
12
  n_mels: 128
13
  sample_rate: ${sample_rate}
14
+ model_sample_rate: 16000
15
 
cfg/model/{cls_panns_44k.yaml β†’ cls_panns_48k.yaml} RENAMED
File without changes
cfg/model/cls_panns_48k_64.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+ model:
3
+ _target_: remfx.models.FXClassifier
4
+ lr: 3e-4
5
+ lr_weight_decay: 1e-3
6
+ sample_rate: ${sample_rate}
7
+ mixup: False
8
+ network:
9
+ _target_: remfx.classifier.Cnn14
10
+ num_classes: ${num_classes}
11
+ n_fft: 2048
12
+ hop_length: 512
13
+ n_mels: 64
14
+ sample_rate: ${sample_rate}
15
+ model_sample_rate: ${sample_rate}
16
+ specaugment: False
17
+
cfg/model/{cls_panns_44k_mixup.yaml β†’ cls_panns_48k_mixup.yaml} RENAMED
File without changes
cfg/model/cls_panns_48k_specaugment.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+ model:
3
+ _target_: remfx.models.FXClassifier
4
+ lr: 3e-4
5
+ lr_weight_decay: 1e-3
6
+ sample_rate: ${sample_rate}
7
+ mixup: False
8
+ network:
9
+ _target_: remfx.classifier.Cnn14
10
+ num_classes: ${num_classes}
11
+ n_fft: 2048
12
+ hop_length: 512
13
+ n_mels: 128
14
+ sample_rate: ${sample_rate}
15
+ model_sample_rate: ${sample_rate}
16
+ specaugment: True
cfg/model/cls_panns_48k_specaugment_label_smoothing.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+ model:
3
+ _target_: remfx.models.FXClassifier
4
+ lr: 3e-4
5
+ lr_weight_decay: 1e-3
6
+ sample_rate: ${sample_rate}
7
+ mixup: False
8
+ label_smoothing: 0.15
9
+ network:
10
+ _target_: remfx.classifier.Cnn14
11
+ num_classes: ${num_classes}
12
+ n_fft: 2048
13
+ hop_length: 512
14
+ n_mels: 128
15
+ sample_rate: ${sample_rate}
16
+ model_sample_rate: ${sample_rate}
17
+ specaugment: True
cfg/model/cls_panns_pt.yaml CHANGED
@@ -4,6 +4,7 @@ model:
4
  lr: 3e-4
5
  lr_weight_decay: 1e-3
6
  sample_rate: ${sample_rate}
 
7
  network:
8
  _target_: remfx.classifier.PANNs
9
  num_classes: ${num_classes}
 
4
  lr: 3e-4
5
  lr_weight_decay: 1e-3
6
  sample_rate: ${sample_rate}
7
+ mixup: False
8
  network:
9
  _target_: remfx.classifier.PANNs
10
  num_classes: ${num_classes}
remfx/classifier.py CHANGED
@@ -31,7 +31,7 @@ class PANNs(torch.nn.Module):
31
  torch.nn.Linear(hidden_dim, num_classes),
32
  )
33
 
34
- def forward(self, x: torch.Tensor):
35
  with torch.no_grad():
36
  x = self.resample(x)
37
  embed = panns_hear.get_scene_embeddings(x.view(x.shape[0], -1), self.model)
@@ -59,7 +59,7 @@ class Wav2CLIP(nn.Module):
59
  torch.nn.Linear(hidden_dim, num_classes),
60
  )
61
 
62
- def forward(self, x: torch.Tensor):
63
  with torch.no_grad():
64
  x = self.resample(x)
65
  embed = wav2clip_hear.get_scene_embeddings(
@@ -89,7 +89,7 @@ class VGGish(nn.Module):
89
  torch.nn.Linear(hidden_dim, num_classes),
90
  )
91
 
92
- def forward(self, x: torch.Tensor):
93
  with torch.no_grad():
94
  x = self.resample(x)
95
  embed = hearbaseline.vggish.get_scene_embeddings(
@@ -119,7 +119,7 @@ class wav2vec2(nn.Module):
119
  torch.nn.Linear(hidden_dim, num_classes),
120
  )
121
 
122
- def forward(self, x: torch.Tensor):
123
  with torch.no_grad():
124
  x = self.resample(x)
125
  embed = hearbaseline.wav2vec2.get_scene_embeddings(
@@ -179,6 +179,10 @@ class Cnn14(nn.Module):
179
  orig_freq=sample_rate, new_freq=model_sample_rate
180
  )
181
 
 
 
 
 
182
  def init_weight(self):
183
  init_bn(self.bn0)
184
  init_layer(self.fc1)
 
31
  torch.nn.Linear(hidden_dim, num_classes),
32
  )
33
 
34
+ def forward(self, x: torch.Tensor, **kwargs):
35
  with torch.no_grad():
36
  x = self.resample(x)
37
  embed = panns_hear.get_scene_embeddings(x.view(x.shape[0], -1), self.model)
 
59
  torch.nn.Linear(hidden_dim, num_classes),
60
  )
61
 
62
+ def forward(self, x: torch.Tensor, **kwargs):
63
  with torch.no_grad():
64
  x = self.resample(x)
65
  embed = wav2clip_hear.get_scene_embeddings(
 
89
  torch.nn.Linear(hidden_dim, num_classes),
90
  )
91
 
92
+ def forward(self, x: torch.Tensor, **kwargs):
93
  with torch.no_grad():
94
  x = self.resample(x)
95
  embed = hearbaseline.vggish.get_scene_embeddings(
 
119
  torch.nn.Linear(hidden_dim, num_classes),
120
  )
121
 
122
+ def forward(self, x: torch.Tensor, **kwargs):
123
  with torch.no_grad():
124
  x = self.resample(x)
125
  embed = hearbaseline.wav2vec2.get_scene_embeddings(
 
179
  orig_freq=sample_rate, new_freq=model_sample_rate
180
  )
181
 
182
+ if self.specaugment:
183
+ self.freq_mask = torchaudio.transforms.FrequencyMasking(64, True)
184
+ self.time_mask = torchaudio.transforms.TimeMasking(128, True)
185
+
186
  def init_weight(self):
187
  init_bn(self.bn0)
188
  init_layer(self.fc1)