File size: 4,846 Bytes
53f8a32
 
 
 
 
 
5b5d4c0
 
53f8a32
 
f4fe081
53f8a32
 
 
9846d74
53f8a32
f4fe081
 
53f8a32
 
9ef73e2
 
 
 
f4fe081
9846d74
f4fe081
 
 
 
 
 
9846d74
53f8a32
 
 
 
 
 
 
 
 
 
 
80ee0e5
53f8a32
 
 
0f11bd1
 
53f8a32
 
fbe7d93
53f8a32
 
 
fbe7d93
babf22d
37e87fa
53f8a32
 
 
 
fbe7d93
53f8a32
 
 
 
9ef73e2
 
 
53f8a32
6473463
9ef73e2
 
 
ce55168
f4fe081
 
 
 
 
 
 
 
 
9ef73e2
2d5fa2d
53f8a32
2d5fa2d
53f8a32
 
 
 
fbe7d93
53f8a32
fbe7d93
53f8a32
 
9ef73e2
53f8a32
 
 
 
 
 
9ef73e2
 
53f8a32
9ef73e2
f4fe081
 
 
 
 
 
 
 
 
 
 
 
9ef73e2
53f8a32
9ef73e2
53f8a32
4263bcd
53f8a32
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Copyright 2022 Balacoon

TTS interactive demo
"""

import os
import glob
import logging
from typing import cast
from threading import Lock

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# lock that keeps more than one thread from touching the tts object at a time
locker = Lock()
# global tts module, created from whichever model was selected last
tts = None
# path to the model that is currently loaded into tts
cur_model_path = None
# cache of speakers, maps model name to the speaker list of that model
model_to_speakers = {}
model_repo_dir = "/data"
# fetch every file of the model repo that is not yet present in model_repo_dir
for name in list_repo_files(repo_id="balacoon/tts"):
    if os.path.isfile(os.path.join(model_repo_dir, name)):
        # already downloaded earlier, nothing to do
        continue
    hf_hub_download(repo_id="balacoon/tts", filename=name, local_dir=model_repo_dir)


def main():
    """
    builds the gradio demo (text box, model and speaker dropdowns,
    "Generate" button, audio output), wires the callbacks
    and launches the demo behind a queue.
    """
    logging.basicConfig(level=logging.INFO)
    # utterances longer than this number of characters are truncated
    max_text_len = 1024

    def load_model(model_path: str):
        """
        makes sure that the global `tts` holds the model stored at `model_path`.
        has to be called with `locker` held.
        if creating the `TTS` object raises, `tts` and `cur_model_path`
        are both left as None, so the next call retries from a clean state.
        """
        global tts, cur_model_path
        if tts is not None and model_path == cur_model_path:
            return
        # drop the reference to the previous model before loading the new one.
        # the name is rebound to None rather than removed with `del`,
        # so that a failed load does not leave the global undefined.
        tts = None
        cur_model_path = None
        tts = TTS(model_path)
        cur_model_path = model_path

    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>

            1. Write an utterance to generate,
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!

            You can learn more about models available
            [here](https://huggingface.co/balacoon/tts).
            Visit [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # only addons with the `_cpu.addon` suffix are offered as models
                repo_files = os.listdir(model_repo_dir)
                model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

            def set_model(model_name_str: str):
                """
                gets value from `model_name`. either
                uses cached list of speakers for the given model name
                or loads the addon and checks what are the speakers.
                returns an update for the `speaker` dropdown, with
                the last speaker of the list preselected.
                if no model is selected, the speaker dropdown is emptied.
                """
                if not model_name_str:
                    # dropdown was cleared, there is nothing to load
                    return gr.Dropdown.update(choices=[], value=None, visible=True)

                speakers = model_to_speakers.get(model_name_str)
                if speakers is None:
                    with locker:
                        # need to load this model to learn the list of speakers
                        load_model(os.path.join(model_repo_dir, model_name_str))
                        speakers = tts.get_speakers()
                        model_to_speakers[model_name_str] = speakers

                # guard against a model that reports no speakers at all
                value = speakers[-1] if speakers else None
                return gr.Dropdown.update(
                    choices=speakers, value=value, visible=True
                )

            model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
            """
            gets utterance to synthesize from `text` Textbox,
            model name from `model_name` and speaker name
            from `speaker` dropdown list.
            returns None if any of the three is missing. otherwise
            synthesizes the waveform and updates `audio` with it.
            """
            if not text_str or not model_name_str or not speaker_str:
                logging.info("text, model name or speaker are not provided")
                return None
            # slicing leaves texts that are already short enough untouched
            text_str = text_str[:max_text_len]
            with locker:
                # reloads the model if a different one is currently in `tts`
                load_model(os.path.join(model_repo_dir, model_name_str))
                samples = tts.synthesize(text_str, speaker_str)
                # read the rate under the same lock, so that it belongs
                # to the model that produced the samples
                sampling_rate = tts.get_sampling_rate()
            return gr.Audio.update(value=(sampling_rate, samples))

        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)

    demo.queue(concurrency_count=1).launch()


# start the demo only when the file is executed as a script, not when imported
if __name__ == "__main__":
    main()