style = mimic3 generation
Browse files- mimic3_make_harvard_sentences.py +68 -69
mimic3_make_harvard_sentences.py
CHANGED
@@ -34,41 +34,44 @@ import audiofile
|
|
34 |
|
35 |
|
36 |
# ================================================ LIST OF VOICES
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
#
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
#
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
# ====================================================== LIST Mimic-3 ALL VOICES
|
67 |
list_voices = [
|
68 |
'en_US/m-ailabs_low#mary_ann',
|
69 |
'en_UK/apope_low',
|
70 |
'de_DE/thorsten-emotion_low#neutral', # is the 4x speed really interesting? We could just mention it in the Section
|
71 |
-
'
|
|
|
|
|
|
|
72 |
] # special case: for 'human' a specific style file is loaded - Mimic3 is not run
|
73 |
|
74 |
|
@@ -293,7 +296,7 @@ for _id, _voice in enumerate(list_voices):
|
|
293 |
total_audio_mimic3 = []
|
294 |
total_audio_styletts2 = []
|
295 |
ix = 0
|
296 |
-
for list_of_10 in harvard_individual_sentences[:
|
297 |
|
298 |
text = ' '.join(list_of_10['sentences'])
|
299 |
|
@@ -312,7 +315,7 @@ for _id, _voice in enumerate(list_voices):
|
|
312 |
f'<prosody rate=\'{rate}\'>'
|
313 |
f'<voice name=\'{_voice}\'>'
|
314 |
'<s>'
|
315 |
-
f'{text}'
|
316 |
'</s>'
|
317 |
'</voice>'
|
318 |
'</prosody>'
|
@@ -353,7 +356,9 @@ for _id, _voice in enumerate(list_voices):
|
|
353 |
# style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
|
354 |
# --
|
355 |
# MSP['emotion.test-1'].get().sort_values('valence').index[-1]
|
356 |
-
style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
|
|
|
|
|
357 |
x, fs = audiofile.read(style_path) # ensure it is not very short - roughly equal to the Harvard sentence length
|
358 |
print(x.shape,' human') # crop human to almost mimic-3 duration
|
359 |
total_audio_mimic3.append(x)
|
@@ -426,7 +431,7 @@ for _id, _voice in enumerate(list_voices):
|
|
426 |
|
427 |
|
428 |
|
429 |
-
|
430 |
print('\nVisuals\n')
|
431 |
|
432 |
# ===============================================================================
|
@@ -475,32 +480,39 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
475 |
p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
|
476 |
vis_df[k] = p
|
477 |
preds = vis_df
|
478 |
-
fig, ax = plt.subplots(nrows=
|
479 |
|
480 |
|
481 |
# ADV - subplots
|
482 |
|
483 |
-
time_stamp = preds[f'mimic3_{
|
484 |
for j, dim in enumerate(['arousal',
|
485 |
'dominance',
|
486 |
'valence']):
|
487 |
|
488 |
# MIMIC3
|
489 |
|
490 |
-
ax[j, 0].plot(time_stamp,
|
|
|
|
|
491 |
color=(0,104/255,139/255),
|
492 |
label='mean_1',
|
493 |
linewidth=2)
|
|
|
|
|
|
|
|
|
|
|
494 |
ax[j, 0].fill_between(time_stamp,
|
495 |
|
496 |
preds[f'styletts2_{_str1}'][dim],
|
497 |
preds[f'mimic3_{_str1}'][dim],
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
if j == 0:
|
502 |
-
ax[j, 0].legend([f'
|
503 |
-
|
504 |
prop={'size': 10},
|
505 |
# loc='lower right'
|
506 |
)
|
@@ -508,8 +520,6 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
508 |
|
509 |
# TICK
|
510 |
ax[j, 0].set_ylim([1e-7, .9999])
|
511 |
-
# ax[j, 0].set_yticks([.25, .5,.75])
|
512 |
-
# ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
|
513 |
ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
|
514 |
ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
|
515 |
|
@@ -517,21 +527,20 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
517 |
# MIMIC3 4x speed
|
518 |
|
519 |
|
520 |
-
ax[j, 1].plot(time_stamp, preds[f'
|
521 |
color=(0,104/255,139/255),
|
522 |
label='mean_1',
|
523 |
linewidth=2)
|
524 |
ax[j, 1].fill_between(time_stamp,
|
525 |
-
|
526 |
-
preds[f'styletts2_{_str2}'][dim],
|
527 |
preds[f'mimic3_{_str2}'][dim],
|
528 |
-
|
529 |
-
color=(.
|
530 |
-
alpha
|
531 |
if j == 0:
|
532 |
-
ax[j, 1].legend([
|
533 |
-
|
534 |
-
|
|
|
535 |
# loc='lower right'
|
536 |
)
|
537 |
|
@@ -561,34 +570,25 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
561 |
for j, dim in enumerate(['Angry',
|
562 |
'Sad',
|
563 |
'Happy',
|
564 |
-
'Surprise',
|
565 |
'Fear',
|
566 |
'Disgust',
|
567 |
-
'Contempt',
|
568 |
# 'Neutral'
|
569 |
]): # ASaHSuFDCN
|
570 |
j = j + 3 # skip the A/D/V subplots
|
571 |
|
572 |
# MIMIC3
|
573 |
|
574 |
-
ax[j, 0].plot(time_stamp, preds[f'
|
575 |
color=(0,104/255,139/255),
|
576 |
label='mean_1',
|
577 |
linewidth=2)
|
578 |
ax[j, 0].fill_between(time_stamp,
|
579 |
-
|
580 |
-
preds[f'mimic3_{_str2}'][dim],
|
581 |
preds[f'styletts2_{_str2}'][dim],
|
582 |
-
|
583 |
-
color=(.
|
584 |
-
alpha
|
585 |
-
# ax[j, 0].legend(['StyleTTS2 style mimic3',
|
586 |
-
# 'StyleTTS2 style crema-d'],
|
587 |
-
# prop={'size': 10},
|
588 |
-
# # loc='upper left'
|
589 |
-
# )
|
590 |
-
|
591 |
-
|
592 |
ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
|
593 |
|
594 |
# TICKS
|
@@ -601,7 +601,7 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
601 |
# MIMIC3 4x speed
|
602 |
|
603 |
|
604 |
-
ax[j, 1].plot(time_stamp, preds[f'
|
605 |
color=(0,104/255,139/255),
|
606 |
label='mean_1',
|
607 |
linewidth=2)
|
@@ -609,9 +609,8 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
|
|
609 |
|
610 |
preds[f'mimic3_{_str2}'][dim],
|
611 |
preds[f'styletts2_{_str2}'][dim],
|
612 |
-
|
613 |
-
|
614 |
-
alpha=0.244)
|
615 |
# ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
|
616 |
# 'StyleTTS2 style crema-d'],
|
617 |
# prop={'size': 10},
|
|
|
34 |
|
35 |
|
36 |
# ================================================ LIST OF VOICES
|
37 |
+
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
|
38 |
+
foreign_voices = []
|
39 |
+
english_voices = []
|
40 |
+
for lang in os.listdir(ROOT_DIR + 'voices'):
|
41 |
|
42 |
+
for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
|
43 |
+
if 'en_' in lang:
|
44 |
+
|
45 |
+
try:
|
46 |
+
with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
|
47 |
+
for spk in f:
|
48 |
+
english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
|
49 |
+
# voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
|
50 |
+
except FileNotFoundError:
|
51 |
+
english_voices.append(lang + '/' + voice)
|
52 |
+
|
53 |
+
else:
|
54 |
|
55 |
+
try:
|
56 |
+
with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
|
57 |
+
for spk in f:
|
58 |
+
foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
|
59 |
|
60 |
+
except FileNotFoundError:
|
61 |
+
foreign_voices.append(lang + '/' + voice)
|
62 |
+
#
|
63 |
+
[print(i) for i in foreign_voices]
|
64 |
+
print('\n_______________________________\n')
|
65 |
+
[print(i) for i in english_voices]
|
66 |
# ====================================================== LIST Mimic-3 ALL VOICES
|
67 |
list_voices = [
|
68 |
'en_US/m-ailabs_low#mary_ann',
|
69 |
'en_UK/apope_low',
|
70 |
'de_DE/thorsten-emotion_low#neutral', # is the 4x speed really interesting? We could just mention it in the Section
|
71 |
+
# 'ko_KO/kss_low',
|
72 |
+
'fr_FR/m-ailabs_low#gilles_g_le_blanc',
|
73 |
+
|
74 |
+
#'human',
|
75 |
] # special case: for 'human' a specific style file is loaded - Mimic3 is not run
|
76 |
|
77 |
|
|
|
296 |
total_audio_mimic3 = []
|
297 |
total_audio_styletts2 = []
|
298 |
ix = 0
|
299 |
+
for list_of_10 in harvard_individual_sentences[:4]: # 77
|
300 |
|
301 |
text = ' '.join(list_of_10['sentences'])
|
302 |
|
|
|
315 |
f'<prosody rate=\'{rate}\'>'
|
316 |
f'<voice name=\'{_voice}\'>'
|
317 |
'<s>'
|
318 |
+
f'{text[:-1] + ", .. !!!"}'
|
319 |
'</s>'
|
320 |
'</voice>'
|
321 |
'</prosody>'
|
|
|
356 |
# style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
|
357 |
# --
|
358 |
# MSP['emotion.test-1'].get().sort_values('valence').index[-1]
|
359 |
+
# style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
|
360 |
+
# --
|
361 |
+
style_path = '/cache/audb/librispeech/3.1.0/fe182b91/test-clean/3575/170457/3575-170457-0024.wav'
|
362 |
x, fs = audiofile.read(style_path) # ensure it is not very short - roughly equal to the Harvard sentence length
|
363 |
print(x.shape,' human') # crop human to almost mimic-3 duration
|
364 |
total_audio_mimic3.append(x)
|
|
|
431 |
|
432 |
|
433 |
|
434 |
+
|
435 |
print('\nVisuals\n')
|
436 |
|
437 |
# ===============================================================================
|
|
|
480 |
p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
|
481 |
vis_df[k] = p
|
482 |
preds = vis_df
|
483 |
+
fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(24, 19.2), gridspec_kw={'hspace': 0, 'wspace': .04})
|
484 |
|
485 |
|
486 |
# ADV - subplots
|
487 |
|
488 |
+
time_stamp = preds[f'mimic3_{_str1}'].index.to_numpy()
|
489 |
for j, dim in enumerate(['arousal',
|
490 |
'dominance',
|
491 |
'valence']):
|
492 |
|
493 |
# MIMIC3
|
494 |
|
495 |
+
ax[j, 0].plot(time_stamp,
|
496 |
+
# np.ones_like(time_stamp) * .4, --> to find the line on the legend
|
497 |
+
preds[f'styletts2_{_str1}'][dim], # THIS IS THE BLUE LINE VERIFIED
|
498 |
color=(0,104/255,139/255),
|
499 |
label='mean_1',
|
500 |
linewidth=2)
|
501 |
+
# ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str1}'][dim],
|
502 |
+
# color=(.2, .2, .2),
|
503 |
+
# label='mean_1',
|
504 |
+
# linewidth=2,
|
505 |
+
# marker='o')
|
506 |
ax[j, 0].fill_between(time_stamp,
|
507 |
|
508 |
preds[f'styletts2_{_str1}'][dim],
|
509 |
preds[f'mimic3_{_str1}'][dim],
|
510 |
+
color=(.5,.5,.5),
|
511 |
+
alpha=.4
|
512 |
+
)
|
513 |
if j == 0:
|
514 |
+
ax[j, 0].legend([f'StyleTTS2 using {_str1}',
|
515 |
+
f'mimic3_{_str1}'],
|
516 |
prop={'size': 10},
|
517 |
# loc='lower right'
|
518 |
)
|
|
|
520 |
|
521 |
# TICK
|
522 |
ax[j, 0].set_ylim([1e-7, .9999])
|
|
|
|
|
523 |
ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
|
524 |
ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
|
525 |
|
|
|
527 |
# MIMIC3 4x speed
|
528 |
|
529 |
|
530 |
+
ax[j, 1].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
|
531 |
color=(0,104/255,139/255),
|
532 |
label='mean_1',
|
533 |
linewidth=2)
|
534 |
ax[j, 1].fill_between(time_stamp,
|
|
|
|
|
535 |
preds[f'mimic3_{_str2}'][dim],
|
536 |
+
preds[f'styletts2_{_str2}'][dim],
|
537 |
+
color=(.5,.5,.5),
|
538 |
+
alpha=.4)
|
539 |
if j == 0:
|
540 |
+
ax[j, 1].legend([
|
541 |
+
f'StyleTTS2 using {_str2}',
|
542 |
+
f'mimic3_{_str2}'],
|
543 |
+
prop={'size': 10},
|
544 |
# loc='lower right'
|
545 |
)
|
546 |
|
|
|
570 |
for j, dim in enumerate(['Angry',
|
571 |
'Sad',
|
572 |
'Happy',
|
573 |
+
# 'Surprise',
|
574 |
'Fear',
|
575 |
'Disgust',
|
576 |
+
# 'Contempt',
|
577 |
# 'Neutral'
|
578 |
]): # ASaHSuFDCN
|
579 |
j = j + 3 # skip the A/D/V subplots
|
580 |
|
581 |
# MIMIC3
|
582 |
|
583 |
+
ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
|
584 |
color=(0,104/255,139/255),
|
585 |
label='mean_1',
|
586 |
linewidth=2)
|
587 |
ax[j, 0].fill_between(time_stamp,
|
|
|
|
|
588 |
preds[f'styletts2_{_str2}'][dim],
|
589 |
+
preds[f'mimic3_{_str2}'][dim],
|
590 |
+
color=(.5,.5,.5),
|
591 |
+
alpha=.4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
592 |
ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
|
593 |
|
594 |
# TICKS
|
|
|
601 |
# MIMIC3 4x speed
|
602 |
|
603 |
|
604 |
+
ax[j, 1].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
|
605 |
color=(0,104/255,139/255),
|
606 |
label='mean_1',
|
607 |
linewidth=2)
|
|
|
609 |
|
610 |
preds[f'mimic3_{_str2}'][dim],
|
611 |
preds[f'styletts2_{_str2}'][dim],
|
612 |
+
color=(.5,.5,.5),
|
613 |
+
alpha=.4)
|
|
|
614 |
# ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
|
615 |
# 'StyleTTS2 style crema-d'],
|
616 |
# prop={'size': 10},
|