Update README.md
README.md (changed)

- speech-emotion-recognition
- dkounadis
---

Teacher model based on [WavLM](https://huggingface.co/3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes) and [wav2vec2](https://hf.rst.im/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim) for arousal, dominance, and valence prediction.

Achieves 0.68 valence CCC on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) Test 1.

# Benchmarks

<table style="width:500px">
<tr><th colspan=6 align="center">CCC MSP Podcast v1.7</th></tr>
<tr><th colspan=3 align="center">Test 1</th><th colspan=3 align="center">Test 2</th></tr>
<tr> <td>Val</td> <td>Dom</td> <td>Aro</td> <td>Val</td> <td>Dom</td> <td>Aro</td> </tr>
<tr> <td>0.6760566</td> <td>0.6840190</td> <td>0.7620374</td> <td>0.4229267</td> <td>0.4684658</td> <td>0.4857733</td> </tr>
</table>

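The reported numbers are concordance correlation coefficients (CCC) between predicted and annotated attribute values. For reference, a minimal sketch of the metric (the function and array names are illustrative, not part of this repository):

```python
import numpy as np

def ccc(pred, gold):
    '''Concordance correlation coefficient of two 1-D arrays.'''
    pred, gold = np.asarray(pred, float), np.asarray(gold, float)
    mx, my = pred.mean(), gold.mean()
    vx, vy = pred.var(), gold.var()
    cov = ((pred - mx) * (gold - my)).mean()
    return 2 * cov / (vx + vy + (mx - my) ** 2)
```
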
# Usage

```python
from transformers import AutoModelForAudioClassification
import librosa
import torch
import types
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

device = 'cuda:0'


class RegressionHead(nn.Module):
    r"""A/D/V regression head."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.dense(x)
        x = torch.tanh(x)
        return self.out_proj(x)


class Dawn(Wav2Vec2PreTrainedModel):
    r"""https://arxiv.org/abs/2203.07378"""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)

    def forward(self, x):
        # z-normalise the raw waveform before the wav2vec2 encoder
        x = x - x.mean(1, keepdim=True)
        variance = (x * x).mean(1, keepdim=True) + 1e-7
        out = self.wav2vec2(x / variance.sqrt())
        return self.classifier(out[0].mean(1)).clip(0, 1)


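# _infer below re-implements the forward pass of the 3loi baseline (it is
# attached to the loaded model via types.MethodType further down): input
# normalisation, the WavLM encoder (ssl_model), attentive statistics pooling
# (pool_model), then the A/D/V regression head (ser_model), clipped to [0, 1].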
def _infer(self, x):
    '''re-definition of forward() for less cpu usage'''
    # x = (x + 8.278621631819787e-05) / 0.08485610250851999
    x = (x + self.config.mean) / self.config.std
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # attention pooling (sap_linear / attention are the pooling modules of the 3loi baseline)
    h = self.pool_model.sap_linear(x).tanh()
    w = torch.matmul(h, self.pool_model.attention)
    w = w.softmax(1)
    mu = torch.sum(x * w, 1)
    x = torch.cat(
        [
            mu,
            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-5).sqrt()
        ], 1)
    return self.ser_model(x).clip(0, 1)


# WavLM
# https://lab-msp.com/MSP-Podcast_Competition/leaderboard.php
base = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
    trust_remote_code=True  # extra definitions, see the above repository
).to(device).eval()
base.forward = types.MethodType(_infer, base)

# Wav2Vec2.0
dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
).to(device)


# Teacher
def wav2small(x):
    '''average the predictions of both teachers'''
    return .5 * dawn(x) + .5 * base(x)


# load an audio file at the model's sampling rate
x, _ = librosa.load('test.wav', sr=base.config.sampling_rate)

with torch.no_grad():
    pred = wav2small(
        torch.from_numpy(x[None, :]).to(device))

print(f'\narousal = {pred[0, 0]}',
      f'\ndominance= {pred[0, 1]}',
      f'\nvalence = {pred[0, 2]}')
```

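Both backbones operate on 16 kHz mono audio (librosa resamples via the `sr` argument), and the predictions are clipped to [0, 1] in the order arousal, dominance, valence. To score several files, the same call can be looped; a minimal sketch (the file names are placeholders):

```python
files = ['speech_a.wav', 'speech_b.wav']  # placeholder paths

preds = []
with torch.no_grad():
    for f in files:
        wav, _ = librosa.load(f, sr=base.config.sampling_rate)
        preds.append(wav2small(torch.from_numpy(wav[None, :]).to(device)))
preds = torch.cat(preds)  # (num_files, 3) = arousal, dominance, valence
print(preds.cpu().numpy())
```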