import json
import os
from random import shuffle

import audb
import audiofile
import numpy as np
import soundfile

LABELS = ['arousal', 'dominance', 'valence']
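
# audb downloads and caches annotated audio databases; load_speech() below
# collects file paths of natural emotional speech (here: EmoDB).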


def load_speech(split=None):
    """Collect file paths of natural (human) emotional speech from audb."""
    # Each entry: [database name, version, table, has timedelta (segmented) index].
    DB = [
        ['emodb', '1.2.0', 'emotion.categories.train.gold_standard', False],
    ]
    output_list = []
    for database_name, ver, table, has_timedeltas in DB:
        # Load the database resampled to 16 kHz, mono, WAV.
        db = audb.load(database_name,
                       sampling_rate=16000,
                       format='wav',
                       mixdown=True,
                       version=ver,
                       cache_root='/cache/audb/')
        df = db[table].get()
        if has_timedeltas:
            # Segmented (timedelta) indices would need start/end handling;
            # they are only reported here.
            print(f'{has_timedeltas=}')
        else:
            output_list += list(df.index)
    return output_list
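
# Note: each audb table is returned as a pandas DataFrame whose index holds
# the media file paths; load_speech() flattens those indices into one list.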

# Natural (human) speech prompts.
natural_wav_paths = load_speech()


import msinference

with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']
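
# Expected harvard.json layout (inferred from the loops below): a top-level
# 'sentences' list whose items each hold one Harvard list of ten sentences,
#   {"sentences": [{"sentences": ["<sentence 1>", ...]}, ...]}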

# Synthetic style prompts; 'en_U' voices are excluded from the foreign sets.
synthetic_wav_paths = ['./enslow/' + i for i in
                       os.listdir('./enslow/')]
synthetic_wav_paths_4x = ['./style_vector_v2/' + i for i in
                          os.listdir('./style_vector_v2/')]
synthetic_wav_paths_foreign = ['./mimic3_foreign/' + i
                               for i in os.listdir('./mimic3_foreign/')
                               if 'en_U' not in i]
synthetic_wav_paths_foreign_4x = ['./mimic3_foreign_4x/' + i
                                  for i in os.listdir('./mimic3_foreign_4x/')
                                  if 'en_U' not in i]

# Keep only style prompts longer than 2 seconds.
synthetic_wav_paths_foreign = [i for i in synthetic_wav_paths_foreign
                               if audiofile.duration(i) > 2]
synthetic_wav_paths_foreign_4x = [i for i in synthetic_wav_paths_foreign_4x
                                  if audiofile.duration(i) > 2]
synthetic_wav_paths = [i for i in synthetic_wav_paths
                       if audiofile.duration(i) > 2]
synthetic_wav_paths_4x = [i for i in synthetic_wav_paths_4x
                          if audiofile.duration(i) > 2]

shuffle(synthetic_wav_paths_foreign_4x)
shuffle(synthetic_wav_paths_foreign)
shuffle(synthetic_wav_paths)
shuffle(synthetic_wav_paths_4x)
print(len(synthetic_wav_paths_foreign_4x), len(synthetic_wav_paths_foreign),
      len(synthetic_wav_paths), len(synthetic_wav_paths_4x))
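
# msinference is assumed to expose a StyleTTS2-like API: compute_style() maps
# a reference wav to a style vector, and inference() synthesizes speech where
# alpha/beta blend text-predicted vs. reference style and embedding_scale is
# the guidance strength. These readings of the parameters are assumptions.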

# One long wav per prompt condition, skipping files that already exist.
PROMPT_LISTS = {
    'english': synthetic_wav_paths,
    'english_4x': synthetic_wav_paths_4x,
    'human': natural_wav_paths,
    'foreign': synthetic_wav_paths_foreign,
    'foreign_4x': synthetic_wav_paths_foreign_4x,
}

for audio_prompt, prompt_paths in PROMPT_LISTS.items():
    OUT_FILE = f'{audio_prompt}_hfullh.wav'
    if os.path.isfile(OUT_FILE):
        print('\nALREADY EXISTS\n')
        continue
    total_audio = []
    total_style = []
    ix = 0
    for list_of_10 in harvard_individual_sentences[:1000]:
        for text in list_of_10['sentences']:
            # Cycle through the style prompts of the current condition.
            _p = prompt_paths[ix % len(prompt_paths)]
            style_vec = msinference.compute_style(_p)
            print(ix, text)
            ix += 1
            x = msinference.inference(text,
                                      style_vec,
                                      alpha=0.3,
                                      beta=0.7,
                                      diffusion_steps=7,
                                      embedding_scale=1)
            total_audio.append(x)
            # Also keep the style prompt itself, truncated to the length of
            # the synthesized audio, for side-by-side listening.
            _st, fsr = audiofile.read(_p)
            total_style.append(_st[:len(x)])
        print('_____________________')
    total_audio = np.concatenate(total_audio)
    soundfile.write(OUT_FILE, total_audio, 24000)
    total_style = np.concatenate(total_style)
    soundfile.write('_st_' + OUT_FILE, total_style, fsr)
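
# Each condition yields two files: '<prompt>_hfullh.wav' with the synthesized
# Harvard sentences at 24 kHz, and '_st_<prompt>_hfullh.wav' with the matched
# style-prompt excerpts (written at the prompts' own sampling rate).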