add styles
Note: this view is limited to 50 files because the commit contains too many changes.
- mimic3_make_harvard_sentences.py +88 -6
- style_vector/en_UK_apope.wav +0 -0
- style_vector/en_US_cmu_arctic_aew.wav +0 -0
- style_vector/en_US_cmu_arctic_ahw.wav +0 -0
- style_vector/en_US_cmu_arctic_aup.wav +0 -0
- style_vector/en_US_cmu_arctic_awbrms.wav +0 -0
- style_vector/en_US_cmu_arctic_axb.wav +0 -0
- style_vector/en_US_cmu_arctic_bdl.wav +0 -0
- style_vector/en_US_cmu_arctic_clb.wav +0 -0
- style_vector/en_US_cmu_arctic_eey.wav +0 -0
- style_vector/en_US_cmu_arctic_fem.wav +0 -0
- style_vector/en_US_cmu_arctic_gka.wav +0 -0
- style_vector/en_US_cmu_arctic_jmk.wav +0 -0
- style_vector/en_US_cmu_arctic_ksp.wav +0 -0
- style_vector/en_US_cmu_arctic_ljm.wav +0 -0
- style_vector/en_US_cmu_arctic_lnh.wav +0 -0
- style_vector/en_US_cmu_arctic_rxr.wav +0 -0
- style_vector/en_US_cmu_arctic_slp.wav +0 -0
- style_vector/en_US_cmu_arctic_slt.wav +0 -0
- style_vector/en_US_hifi-tts_6097.wav +0 -0
- style_vector/en_US_hifi-tts_9017.wav +0 -0
- style_vector/en_US_hifi-tts_92.wav +0 -0
- style_vector/en_US_ljspeech.wav +0 -0
- style_vector/en_US_m-ailabs_elliot_miller.wav +0 -0
- style_vector/en_US_m-ailabs_judy_bieber.wav +0 -0
- style_vector/en_US_m-ailabs_mary_ann.wav +0 -0
- style_vector/en_US_vctk_p225.wav +0 -0
- style_vector/en_US_vctk_p226.wav +0 -0
- style_vector/en_US_vctk_p227.wav +0 -0
- style_vector/en_US_vctk_p228.wav +0 -0
- style_vector/en_US_vctk_p229.wav +0 -0
- style_vector/en_US_vctk_p230.wav +0 -0
- style_vector/en_US_vctk_p231.wav +0 -0
- style_vector/en_US_vctk_p232.wav +0 -0
- style_vector/en_US_vctk_p233.wav +0 -0
- style_vector/en_US_vctk_p234.wav +0 -0
- style_vector/en_US_vctk_p236.wav +0 -0
- style_vector/en_US_vctk_p237.wav +0 -0
- style_vector/en_US_vctk_p238.wav +0 -0
- style_vector/en_US_vctk_p239.wav +0 -0
- style_vector/en_US_vctk_p240.wav +0 -0
- style_vector/en_US_vctk_p241.wav +0 -0
- style_vector/en_US_vctk_p243.wav +0 -0
- style_vector/en_US_vctk_p244.wav +0 -0
- style_vector/en_US_vctk_p245.wav +0 -0
- style_vector/en_US_vctk_p246.wav +0 -0
- style_vector/en_US_vctk_p247.wav +0 -0
- style_vector/en_US_vctk_p248.wav +0 -0
- style_vector/en_US_vctk_p249.wav +0 -0
- style_vector/en_US_vctk_p250.wav +0 -0
mimic3_make_harvard_sentences.py
CHANGED

@@ -77,6 +77,21 @@ list_voices = [
 
 
 # ================================================== INTERFACE MODELS
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 LABELS = [
     'arousal', 'dominance', 'valence',
     # 'speech_synthesizer', 'synthetic_singing',
@@ -131,10 +146,77 @@ teacher_cat.forward = types.MethodType(_infer, teacher_cat)
 
 
 
-#
+# ===================[:]===================== Dawn
+def _prenorm(x, attention_mask=None):
+    '''mean/var'''
+    if attention_mask is not None:
+        N = attention_mask.sum(1, keepdim=True)  # here attn msk is unprocessed just the original input
+        x -= x.sum(1, keepdim=True) / N
+        var = (x * x).sum(1, keepdim=True) / N
+
+    else:
+        x -= x.mean(1, keepdim=True)  # mean is an onnx operator reducemean saves some ops compared to casting integer N to float and the div
+        var = (x * x).mean(1, keepdim=True)
+    return x / torch.sqrt(var + 1e-7)
+
+from torch import nn
+from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
+class RegressionHead(nn.Module):
+    r"""Classification head."""
+
+    def __init__(self, config):
+
+        super().__init__()
+
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.final_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+
+        x = features
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+
+        return x
+
+
+class Dawn(Wav2Vec2PreTrainedModel):
+    r"""Speech emotion classifier."""
+
+    def __init__(self, config):
+
+        super().__init__(config)
+
+        self.config = config
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.classifier = RegressionHead(config)
+        self.init_weights()
+
+    def forward(
+            self,
+            input_values,
+            attention_mask=None,
+            ):
+        x = _prenorm(input_values, attention_mask=attention_mask)
+        outputs = self.wav2vec2(x, attention_mask=attention_mask)
+        hidden_states = outputs[0]
+        hidden_states = torch.mean(hidden_states, dim=1)
+        logits = self.classifier(hidden_states)
+        return logits
+        # return {'hidden_states': hidden_states,
+        #         'logits': logits}
+dawn = Dawn.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(config.dev).eval()
+# =======================================
+
+
+
+
+
 
-# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
-adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')
 
 def process_function(x, sampling_rate, idx):
     '''run audioset ct, adv
@@ -154,7 +236,7 @@ def process_function(x, sampling_rate, idx):
     # logits_audioset = audioset_model(x, 16000)['logits_sounds']
     # logits_audioset = logits_audioset[:, [7, 35]]  # speech synthesizer synthetic singing
     # --
-    logits_adv =
+    logits_adv = dawn(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy()  #['logits']
 
     cat = np.concatenate([logits_adv,
                           # _sigmoid(logits_audioset),
@@ -169,7 +251,7 @@ interface = audinterface.Feature(
     # process_func_args={'outputs': 'logits_scene'},
     process_func_applies_sliding_window=False,
     win_dur=7.0,
-    hop_dur=
+    hop_dur=40.0,
     sampling_rate=16000,
     resample=True,
     verbose=True,
@@ -297,7 +379,7 @@ for _id, _voice in enumerate(list_voices):
     total_audio_mimic3 = []
     total_audio_styletts2 = []
    ix = 0
-    for list_of_10 in harvard_individual_sentences[:
+    for list_of_10 in harvard_individual_sentences[:4]:  # 77
 
        text = ' '.join(list_of_10['sentences'])
 
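For orientation, here is a minimal usage sketch of the Dawn head introduced above, called outside the audinterface pipeline. It assumes the _prenorm and Dawn definitions from this diff are in scope, and it substitutes a plain 'cpu' device string for the script's config.dev so the snippet stays self-contained; it is an illustration of the new call pattern, not code from the commit.

import numpy as np
import torch

device = 'cpu'                                    # the script itself uses config.dev
dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(device).eval()

x = np.zeros((1, 7 * 16000), dtype=np.float32)    # one 7 s window of 16 kHz audio (win_dur=7.0)
with torch.no_grad():
    adv = dawn(torch.from_numpy(x).to(device)).cpu().numpy()
print(adv.shape)                                  # (1, 3): arousal, dominance, valence, matching LABELS

This mirrors the new process_function line, which feeds each windowed chunk through dawn and concatenates the resulting arousal/dominance/valence logits.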
style_vector/en_UK_apope.wav  ADDED  Binary file (99.9 kB)
style_vector/en_US_cmu_arctic_aew.wav  ADDED  Binary file (96.3 kB)
style_vector/en_US_cmu_arctic_ahw.wav  ADDED  Binary file (95.8 kB)
style_vector/en_US_cmu_arctic_aup.wav  ADDED  Binary file (90.2 kB)
style_vector/en_US_cmu_arctic_awbrms.wav  ADDED  Binary file (92.7 kB)
style_vector/en_US_cmu_arctic_axb.wav  ADDED  Binary file (92.2 kB)
style_vector/en_US_cmu_arctic_bdl.wav  ADDED  Binary file (90.7 kB)
style_vector/en_US_cmu_arctic_clb.wav  ADDED  Binary file (96.3 kB)
style_vector/en_US_cmu_arctic_eey.wav  ADDED  Binary file (90.7 kB)
style_vector/en_US_cmu_arctic_fem.wav  ADDED  Binary file (90.2 kB)
style_vector/en_US_cmu_arctic_gka.wav  ADDED  Binary file (90.7 kB)
style_vector/en_US_cmu_arctic_jmk.wav  ADDED  Binary file (92.7 kB)
style_vector/en_US_cmu_arctic_ksp.wav  ADDED  Binary file (93.7 kB)
style_vector/en_US_cmu_arctic_ljm.wav  ADDED  Binary file (89.1 kB)
style_vector/en_US_cmu_arctic_lnh.wav  ADDED  Binary file (91.2 kB)
style_vector/en_US_cmu_arctic_rxr.wav  ADDED  Binary file (93.2 kB)
style_vector/en_US_cmu_arctic_slp.wav  ADDED  Binary file (93.2 kB)
style_vector/en_US_cmu_arctic_slt.wav  ADDED  Binary file (92.2 kB)
style_vector/en_US_hifi-tts_6097.wav  ADDED  Binary file (89.1 kB)
style_vector/en_US_hifi-tts_9017.wav  ADDED  Binary file (88.6 kB)
style_vector/en_US_hifi-tts_92.wav  ADDED  Binary file (90.7 kB)
style_vector/en_US_ljspeech.wav  ADDED  Binary file (101 kB)
style_vector/en_US_m-ailabs_elliot_miller.wav  ADDED  Binary file (102 kB)
style_vector/en_US_m-ailabs_judy_bieber.wav  ADDED  Binary file (104 kB)
style_vector/en_US_m-ailabs_mary_ann.wav  ADDED  Binary file (103 kB)
style_vector/en_US_vctk_p225.wav  ADDED  Binary file (96.8 kB)
style_vector/en_US_vctk_p226.wav  ADDED  Binary file (98.3 kB)
style_vector/en_US_vctk_p227.wav  ADDED  Binary file (97.8 kB)
style_vector/en_US_vctk_p228.wav  ADDED  Binary file (94.8 kB)
style_vector/en_US_vctk_p229.wav  ADDED  Binary file (95.3 kB)
style_vector/en_US_vctk_p230.wav  ADDED  Binary file (95.8 kB)
style_vector/en_US_vctk_p231.wav  ADDED  Binary file (94.8 kB)
style_vector/en_US_vctk_p232.wav  ADDED  Binary file (93.7 kB)
style_vector/en_US_vctk_p233.wav  ADDED  Binary file (95.8 kB)
style_vector/en_US_vctk_p234.wav  ADDED  Binary file (95.8 kB)
style_vector/en_US_vctk_p236.wav  ADDED  Binary file (93.2 kB)
style_vector/en_US_vctk_p237.wav  ADDED  Binary file (95.3 kB)
style_vector/en_US_vctk_p238.wav  ADDED  Binary file (103 kB)
style_vector/en_US_vctk_p239.wav  ADDED  Binary file (94.8 kB)
style_vector/en_US_vctk_p240.wav  ADDED  Binary file (97.8 kB)
style_vector/en_US_vctk_p241.wav  ADDED  Binary file (93.2 kB)
style_vector/en_US_vctk_p243.wav  ADDED  Binary file (97.3 kB)
style_vector/en_US_vctk_p244.wav  ADDED  Binary file (93.7 kB)
style_vector/en_US_vctk_p245.wav  ADDED  Binary file (98.3 kB)
style_vector/en_US_vctk_p246.wav  ADDED  Binary file (98.3 kB)
style_vector/en_US_vctk_p247.wav  ADDED  Binary file (97.3 kB)
style_vector/en_US_vctk_p248.wav  ADDED  Binary file (102 kB)
style_vector/en_US_vctk_p249.wav  ADDED  Binary file (96.3 kB)
style_vector/en_US_vctk_p250.wav  ADDED  Binary file (93.2 kB)
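The style_vector/*.wav files added above are short reference clips whose names match the Mimic 3 voices in list_voices; they presumably serve as style prompts for the StyleTTS2 side of the script (total_audio_styletts2). A rough sketch of reading one such clip is below; the 24 kHz target rate and the compute_style() call are placeholders for whatever style encoder the script actually uses, not something this commit defines.

import soundfile as sf
import librosa

wav, sr = sf.read('style_vector/en_US_vctk_p225.wav')     # one of the clips added in this commit
wav = librosa.resample(wav, orig_sr=sr, target_sr=24000)  # assumed synthesis sample rate
# style = compute_style(wav)  # hypothetical: the actual style-encoder call lives in the StyleTTS2 code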