Update README.md
README.md (changed)

- speech-emotion-recognition
- dkounadis
---

Teacher model based on [WavLM](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) and [wav2vec2](https://hf.rst.im/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim) for arousal, dominance, and valence prediction.

Achieves 0.68 valence CCC on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) Test 1.

# Benchmarks

<table style="width:500px">
<tr><th colspan=6 align="center">CCC MSP Podcast v1.7</th></tr>
<tr><th colspan=3 align="center">Test 1</th><th colspan=3 align="center">Test 2</th></tr>
<tr> <td>Val</td> <td>Dom</td> <td>Aro</td> <td>Val</td> <td>Dom</td> <td>Aro</td> </tr>
<tr> <td>0.6760566</td> <td>0.6840190</td> <td>0.7620374</td> <td>0.4229267</td> <td>0.4684658</td> <td>0.4857733</td> </tr>
</table>

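The reported numbers are concordance correlation coefficients (CCC) between predicted and annotated attribute values. For reference, a minimal sketch of the metric (the function and array names are illustrative, not part of this repository):

```python
import numpy as np

def ccc(pred, gold):
    '''Concordance correlation coefficient of two 1-D arrays.'''
    pred, gold = np.asarray(pred, float), np.asarray(gold, float)
    mx, my = pred.mean(), gold.mean()
    vx, vy = pred.var(), gold.var()
    cov = ((pred - mx) * (gold - my)).mean()
    return 2 * cov / (vx + vy + (mx - my) ** 2)
```
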
# Usage

```python
from transformers import AutoModelForAudioClassification
import librosa
import torch
import types
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

device = 'cuda:0'


class RegressionHead(nn.Module):
    r"""A/D/V regression head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.dense(x)
        x = torch.tanh(x)
        return self.out_proj(x)


class Dawn(Wav2Vec2PreTrainedModel):
    r"""https://arxiv.org/abs/2203.07378"""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)

    def forward(self, x):
        # z-normalise the raw waveform before the wav2vec2 encoder
        x = x - x.mean(1, keepdim=True)
        variance = (x * x).mean(1, keepdim=True) + 1e-7
        out = self.wav2vec2(x / variance.sqrt())
        return self.classifier(out[0].mean(1)).clip(0, 1)


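# _infer below re-implements the forward pass of the 3loi baseline (it is
# attached to the loaded model via types.MethodType further down): input
# normalisation, the WavLM encoder (ssl_model), attentive statistics pooling
# (pool_model), then the A/D/V regression head (ser_model), clipped to [0, 1].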
def _infer(self, x):
    '''re-definition of forward() for less cpu usage'''
    # x = (x + 8.278621631819787e-05) / 0.08485610250851999
    x = (x + self.config.mean) / self.config.std
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # attention pooling (sap_linear / attention are the pooling modules of the 3loi baseline)
    h = self.pool_model.sap_linear(x).tanh()
    w = torch.matmul(h, self.pool_model.attention)
    w = w.softmax(1)
    mu = torch.sum(x * w, 1)
    x = torch.cat(
        [
            mu,
            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-5).sqrt()
        ], 1)
    return self.ser_model(x).clip(0, 1)


# WavLM
# https://lab-msp.com/MSP-Podcast_Competition/leaderboard.php
base = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
    trust_remote_code=True  # extra definitions, see the above repository
).to(device).eval()
base.forward = types.MethodType(_infer, base)

# Wav2Vec2.0
dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
).to(device)


# Teacher
def wav2small(x):
    '''average the predictions of both teachers'''
    return .5 * dawn(x) + .5 * base(x)


# load an audio file at the model's sampling rate
x, _ = librosa.load('test.wav', sr=base.config.sampling_rate)

with torch.no_grad():
    pred = wav2small(
        torch.from_numpy(x[None, :]).to(device))

print(f'\narousal = {pred[0, 0]}',
      f'\ndominance= {pred[0, 1]}',
      f'\nvalence = {pred[0, 2]}')
```

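Both backbones operate on 16 kHz mono audio (librosa resamples via the `sr` argument), and the predictions are clipped to [0, 1] in the order arousal, dominance, valence. To score several files, the same call can be looped; a minimal sketch (the file names are placeholders):

```python
files = ['speech_a.wav', 'speech_b.wav']  # placeholder paths

preds = []
with torch.no_grad():
    for f in files:
        wav, _ = librosa.load(f, sr=base.config.sampling_rate)
        preds.append(wav2small(torch.from_numpy(wav[None, :]).to(device)))
preds = torch.cat(preds)  # (num_files, 3) = arousal, dominance, valence
print(preds.cpu().numpy())
```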