dkounadis committed · Commit da56fb8 · verified · 1 Parent(s): f7e0f31

Update README.md

Files changed (1): README.md (+40 -35)

README.md:
- speech-emotion-recognition
- dkounadis
---
Teacher model based on [WavLM](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) and [wav2vec2](https://hf.rst.im/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim) for arousal, dominance and valence prediction in the range [0, 1], used for dimensional speech emotion recognition.
Achieves 0.68 valence CCC on the [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) Test 1 split.



# Benchmarks
<table style="width:500px">
<tr><th colspan=6 align="center">CCC MSP Podcast v1.7</th></tr>
<tr><th colspan=3 align="center">Test 1</th><th colspan=3 align="center">Test 2</th></tr>
<tr> <td>Val</td> <td>Dom</td> <td>Aro</td> <td>Val</td> <td>Dom</td> <td>Aro</td> </tr>
<tr> <td>0.6760566</td> <td>0.6840190</td> <td>0.7620374</td> <td>0.4229267</td> <td>0.4684658</td> <td>0.4857733</td> </tr>
</table>
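
For reference, CCC (concordance correlation coefficient) rewards both the correlation between predictions and labels and the agreement of their means and variances. A minimal sketch of the metric, assuming NumPy arrays of per-utterance predictions and labels (the `ccc` helper below is illustrative and not part of this repository):

```python
import numpy as np

def ccc(pred, label):
    # concordance correlation coefficient between two 1-D arrays
    pred = np.asarray(pred, dtype=float)
    label = np.asarray(label, dtype=float)
    mean_p, mean_l = pred.mean(), label.mean()
    var_p, var_l = pred.var(), label.var()
    cov = ((pred - mean_p) * (label - mean_l)).mean()
    return 2 * cov / (var_p + var_l + (mean_p - mean_l) ** 2)

# e.g. ccc(valence_predictions, valence_labels) -> value in [-1, 1]
```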


# Usage
```python
from transformers import AutoModelForAudioClassification
import librosa
import torch
import types
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

device = 'cuda:0'


class RegressionHead(nn.Module):
    r"""Arousal / dominance / valence regression head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.dense(x)
        x = torch.tanh(x)
        return self.out_proj(x)


class Dawn(Wav2Vec2PreTrainedModel):
    r"""https://arxiv.org/abs/2203.07378"""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)

    def forward(self, x):
        # normalise the raw waveform to zero mean / unit variance
        x = x - x.mean(1, keepdim=True)
        variance = (x * x).mean(1, keepdim=True) + 1e-7
        out = self.wav2vec2(x / variance.sqrt())
        return self.classifier(out[0].mean(1)).clip(0, 1)


def _infer(self, x):
    '''Redefinition of the WavLM forward pass for less CPU usage.'''
    # x = (x + 8.278621631819787e-05) / 0.08485610250851999
    x = (x + self.config.mean) / self.config.std
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # attentive statistics pooling: weighted mean and standard deviation over time
    h = torch.tanh(self.pool_model.sap_linear(x))
    w = torch.matmul(h, self.pool_model.attention)
    w = w.softmax(1)
    mu = torch.sum(x * w, 1)
    x = torch.cat(
        [
            mu,
            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-5).sqrt()
        ], 1)
    return self.ser_model(x).clip(0, 1)


# WavLM
# https://lab-msp.com/MSP-Podcast_Competition/leaderboard.php
base = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
    trust_remote_code=True  # extra definitions, see the repository above
).to(device).eval()
base.forward = types.MethodType(_infer, base)

# Wav2Vec2.0
dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
).to(device).eval()


# Teacher
def wav2small(x):
    '''Average the predictions of the two models.'''
    return .5 * dawn(x) + .5 * base(x)


x, _ = librosa.load('test.wav', sr=base.config.sampling_rate)

with torch.no_grad():
    pred = wav2small(
        torch.from_numpy(x[None, :]).to(device))

print(f'\narousal   = {pred[0, 0]}',
      f'\ndominance = {pred[0, 1]}',
      f'\nvalence   = {pred[0, 2]}')
  ```
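
Since this ensemble serves as a teacher, the same objects can pseudo-label a batch of recordings with arousal / dominance / valence values, e.g. as soft targets for a smaller student. A short sketch reusing `wav2small`, `base` and `device` from above (the file paths and CSV layout are illustrative):

```python
import csv

files = ['a.wav', 'b.wav']  # hypothetical paths

with torch.no_grad(), open('teacher_labels.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['file', 'arousal', 'dominance', 'valence'])
    for name in files:
        wav, _ = librosa.load(name, sr=base.config.sampling_rate)
        p = wav2small(torch.from_numpy(wav[None, :]).to(device))
        writer.writerow([name, p[0, 0].item(), p[0, 1].item(), p[0, 2].item()])
```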