# tests/test_tts.py
import os

import pytest

from tests.utils import wrap_test_forked
from src.tts_sentence_parsing import init_sentence_state
from tests.test_sentence_parsing import bot_list


@pytest.mark.audio
@wrap_test_forked
def test_sentence_to_wave():
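    """Synthesize one sentence with the Coqui XTTS model and verify the result is a readable WAVE file."""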
    os.environ['CUDA_HOME'] = '/usr/local/cuda-11.7'
    from src.tts_coqui import sentence_to_wave, get_xtt, get_latent, get_role_to_wave_map
    chatbot_role = "Female AI Assistant"
    sentence = "I am an AI assistant. I can help you with any tasks."
    # supported_languages = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
    tts_speed = 1.0
    model, supported_languages = get_xtt()
    latent = get_latent(get_role_to_wave_map()[chatbot_role], model=model)
    generated_speech = sentence_to_wave(sentence,
                                        supported_languages,
                                        tts_speed,
                                        latent=latent,
                                        model=model,
                                        return_as_byte=False,
                                        return_nonbyte_as_file=True,
                                        return_gradio=False)
    print(generated_speech, flush=True)
    # confirm the output is a valid WAVE file; wave.open raises wave.Error if it is not
    import wave
    with wave.open(generated_speech, mode='rb'):
        pass


@pytest.mark.audio
@wrap_test_forked
def test_generate_speech():
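    """Stream a response into generate_speech one character at a time (smoke test; no outputs are checked)."""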
    os.environ['CUDA_HOME'] = '/usr/local/cuda-11.7'
    from src.tts_coqui import generate_speech, get_xtt, get_latent, get_role_to_wave_map
    chatbot_role = "Female AI Assistant"
    model, supported_languages = get_xtt()
    latent = get_latent(get_role_to_wave_map()[chatbot_role], model=model)
    response = 'I am an AI assistant. What do you want from me? I am very busy.'
    for char in response:
        generate_speech(char, model=model, supported_languages=supported_languages, latent=latent)


@pytest.mark.audio
@wrap_test_forked
def test_full_generate_speech():
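    """Stream a response word by word through generate_speech and check that completed sentences and audio chunks match expectations."""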
    os.environ['CUDA_HOME'] = '/usr/local/cuda-11.7'
    from src.tts_coqui import generate_speech, get_xtt, get_latent, get_role_to_wave_map
    bot = 'I am an AI assistant. What do you want from me? I am very busy.'

    def response_gen():
        for word1 in bot.split(' '):
            yield word1

    chatbot_role = "Female AI Assistant"
    model, supported_languages = get_xtt()
    latent = get_latent(get_role_to_wave_map()[chatbot_role], model=model)
    response = ""
    sentence_state = init_sentence_state()
    sentences = []
    audios = []
    sentences_expected = ['I am an AI assistant.', 'What do you want from me?', 'I am very busy.']
    for word in response_gen():
        response += word + ' '
        audio, sentence, sentence_state = \
            generate_speech(response,
                            model=model,
                            supported_languages=supported_languages,
                            latent=latent,
                            sentence_state=sentence_state,
                            return_as_byte=False,
                            return_nonbyte_as_file=True,
                            return_gradio=False,
                            is_final=False, verbose=True)
        if sentence is not None:
            print(sentence)
            sentences.append(sentence)
        if audio is not None:
            audios.append(audio)
    # final call with is_final=True flushes any sentence still buffered in sentence_state
    audio, sentence, sentence_state = \
        generate_speech(response,
                        model=model,
                        supported_languages=supported_languages,
                        latent=latent,
                        sentence_state=sentence_state,
                        return_as_byte=False,
                        return_nonbyte_as_file=True,
                        return_gradio=False,
                        is_final=True, verbose=True)
    if sentence is not None:
        print(sentence)
        sentences.append(sentence)
    if audio is not None:
        audios.append(audio)
    assert sentences == sentences_expected
    assert len(sentences) == len(audios)
    print(audios)


@pytest.mark.audio
@wrap_test_forked
@pytest.mark.parametrize("bot, sentences_expected", bot_list)
def test_predict_from_text(bot, sentences_expected):
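    """Run predict_from_text over parametrized bot texts from bot_list and expect one non-empty audio chunk per expected sentence."""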
    speeches = []
    from src.tts import get_tts_model, get_speakers, predict_from_text
    processor, model, vocoder = get_tts_model()
    speaker = get_speakers()[0]
    tts_speed = 1.0
    for audio in predict_from_text(bot, speaker, tts_speed,
                                   processor=processor, model=model, vocoder=vocoder,
                                   return_as_byte=False,
                                   verbose=True):
        # keep only chunks whose waveform array (audio[1]) is non-empty
        if audio[1].shape[0] > 0:
            speeches.append(audio)
    assert len(speeches) == len(sentences_expected)
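

# Note: these tests all carry the custom 'audio' pytest marker; assuming that marker is
# registered in the repo's pytest configuration, they can be selected with, e.g.:
#   pytest -m audio tests/test_tts.py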