import os

import pytest
from tests.utils import wrap_test_forked
from src.tts_sentence_parsing import init_sentence_state
from tests.test_sentence_parsing import bot_list


@pytest.mark.audio
@wrap_test_forked
def test_sentence_to_wave():
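    """Synthesize a short two-sentence string with the Coqui XTTS model and
    verify that the result is a readable WAV file."""
    # Point CUDA_HOME at the local CUDA toolkit before importing the TTS stack.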
    os.environ['CUDA_HOME'] = '/usr/local/cuda-11.7'
    from src.tts_coqui import sentence_to_wave, get_xtt, get_latent, get_role_to_wave_map

    chatbot_role = "Female AI Assistant"
    sentence = "I am an AI assistant.  I can help you with any tasks."
    # supported_languages = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
    tts_speed = 1.0
    model, supported_languages = get_xtt()
    latent = get_latent(get_role_to_wave_map()[chatbot_role], model=model)
    generated_speech = sentence_to_wave(sentence,
                                        supported_languages,
                                        tts_speed,
                                        latent=latent,
                                        model=model,
                                        return_as_byte=False,
                                        return_nonbyte_as_file=True,
                                        return_gradio=False)
    print(generated_speech, flush=True)

    # Confirm the output is a valid WAV file by opening it with the wave module.
    import wave
    with wave.open(generated_speech, mode='rb'):
        pass


@pytest.mark.audio
@wrap_test_forked
def test_generate_speech():
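    """Smoke test: feed text to generate_speech one character at a time and
    check that each call completes without error."""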
    os.environ['CUDA_HOME'] = '/usr/local/cuda-11.7'
    from src.tts_coqui import generate_speech, get_xtt, get_latent, get_role_to_wave_map

    chatbot_role = "Female AI Assistant"
    model, supported_languages = get_xtt()
    latent = get_latent(get_role_to_wave_map()[chatbot_role], model=model)

    response = 'I am an AI assistant.  What do you want from me?  I am very busy.'
    for char in response:
        generate_speech(char, model=model, supported_languages=supported_languages, latent=latent)


@pytest.mark.audio
@wrap_test_forked
def test_full_generate_speech():
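    """Stream a response word by word through generate_speech, collecting the
    sentences and audio chunks it emits, and check that the sentence splits
    match the expected parsing, with one audio chunk per sentence."""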
    os.environ['CUDA_HOME'] = '/usr/local/cuda-11.7'
    from src.tts_coqui import generate_speech, get_xtt, get_latent, get_role_to_wave_map
    bot = 'I am an AI assistant.  What do you want from me?  I am very busy.'

    def response_gen():
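        # Simulate a streaming LLM response by yielding one word at a time.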
        for word1 in bot.split(' '):
            yield word1

    chatbot_role = "Female AI Assistant"
    model, supported_languages = get_xtt()
    latent = get_latent(get_role_to_wave_map()[chatbot_role], model=model)

    response = ""
    sentence_state = init_sentence_state()

    sentences = []
    audios = []
    sentences_expected = ['I am an AI assistant.', 'What do you want from me?', 'I am very busy.']
    for word in response_gen():
        response += word + ' '
        audio, sentence, sentence_state = \
            generate_speech(response,
                            model=model,
                            supported_languages=supported_languages,
                            latent=latent,
                            sentence_state=sentence_state,
                            return_as_byte=False,
                            return_nonbyte_as_file=True,
                            return_gradio=False,
                            is_final=False, verbose=True)
        if sentence is not None:
            print(sentence)
            sentences.append(sentence)
        if audio is not None:
            audios.append(audio)
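    # A final call with is_final=True flushes any remaining buffered text.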
    audio, sentence, sentence_state = \
        generate_speech(response,
                        model=model,
                        supported_languages=supported_languages,
                        latent=latent,
                        sentence_state=sentence_state,
                        return_as_byte=False,
                        return_nonbyte_as_file=True,
                        return_gradio=False,
                        is_final=True, verbose=True)
    if sentence is not None:
        print(sentence)
        sentences.append(sentence)
    if audio is not None:
        audios.append(audio)
    assert sentences == sentences_expected
    assert len(sentences) == len(audios)
    print(audios)


@pytest.mark.audio
@wrap_test_forked
@pytest.mark.parametrize("bot, sentences_expected", bot_list)
def test_predict_from_text(bot, sentences_expected):
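    """Run the src.tts pipeline over each bot transcript from bot_list and
    check that one non-empty speech chunk is produced per expected sentence."""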
    from src.tts import get_tts_model, get_speakers, predict_from_text

    processor, model, vocoder = get_tts_model()
    speaker = get_speakers()[0]
    tts_speed = 1.0

    speeches = []
    for audio in predict_from_text(bot, speaker, tts_speed,
                                   processor=processor, model=model, vocoder=vocoder,
                                   return_as_byte=False,
                                   verbose=True):
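        # Keep only chunks whose sample array (audio[1]) is non-empty.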
        if audio[1].shape[0] > 0:
            speeches.append(audio)
    assert len(speeches) == len(sentences_expected)