import gradio as gr
import torch

from transformers import pipeline

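# Pick the inference device: CUDA when a GPU is available, otherwise CPU.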
def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device

device = _grab_best_device()

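# Base (non-finetuned) checkpoint per language, used as the reference voice.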
default_model_per_language = {
    "english": "kakao-enterprise/vits-ljs",
    "spanish": "facebook/mms-tts-spa",
}

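# Finetuned single-speaker checkpoints per language, as (display name, Hub model id) pairs.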
models_per_language = {
    "english": [
        ("Welsh Female Speaker", "ylacombe/vits_ljs_welsh_female_monospeaker_2"),
        ("Welsh Male Speaker", "ylacombe/vits_ljs_welsh_male_monospeaker_2"),
        ("Scottish Female Speaker", "ylacombe/vits_ljs_scottish_female_monospeaker"),
        ("Northern Female Speaker", "ylacombe/vits_ljs_northern_female_monospeaker"),
        ("Midlands Male Speaker", "ylacombe/vits_ljs_midlands_male_monospeaker"),
        ("Southern Male Speaker", "ylacombe/vits_ljs_southern_male_monospeaker"),
        ("Irish Male Speaker", "ylacombe/vits_ljs_irish_male_monospeaker_2"),
    ],
    "spanish": [
        ("Male Chilean Speaker", "ylacombe/mms-spa-finetuned-chilean-monospeaker"),       
        ("Female Argentinian Speaker", "ylacombe/mms-spa-finetuned-argentinian-monospeaker"),       
        ("Male Colombian Speaker", "ylacombe/mms-spa-finetuned-colombian-monospeaker"),       
    ],
}

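# Cache of loaded pipelines for the currently selected language: the base checkpoint plus one pipeline per finetuned speaker.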
pipe_dict = {
    "pipe": [pipeline("text-to-speech", model=l[1], device=device) for l in models_per_language["english"]],
    "original_pipe": pipeline("text-to-speech", model=default_model_per_language["english"], device=device),
    "language": "english",
}

title = """# Explore English and Spanish Accents with VITS finetuning
            ## Or how the best wine comes in old bottles

            [VITS](https://huggingface.co/docs/transformers/model_doc/vits) is a lightweight, low-latency TTS model.

            Coupled with the right data and the right training recipe, you can get an excellent finetuned version in **20 minutes** with as little as **80 to 150 samples**.

            Stay tuned, the training recipe is coming soon!
            """

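# Maximum number of audio components rendered in the UI (base model plus finetuned speakers).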
max_speakers = 15

# Inference
def generate_audio(text, language):
    # Reload the pipelines whenever the selected language changes.
    if pipe_dict["language"] != language:
        gr.Warning(f"Language has changed - loading corresponding models: {default_model_per_language[language]}")
        pipe_dict["language"] = language
        pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=device)
        pipe_dict["pipe"] = [pipeline("text-to-speech", model=l[1], device=device) for l in models_per_language[language]]

    out = []

    # First, generate the prediction of the original (non-finetuned) checkpoint.
    output = pipe_dict["original_pipe"](text)
    out.append(
        gr.Audio(
            value=(output["sampling_rate"], output["audio"].squeeze()),
            type="numpy",
            autoplay=False,
            label=f"Prediction from the original checkpoint {default_model_per_language[language]}",
            show_label=True,
            visible=True,
        )
    )

    # Then, generate one prediction per finetuned speaker checkpoint.
    for i in range(min(len(pipe_dict["pipe"]), max_speakers - 1)):
        output = pipe_dict["pipe"][i](text)
        out.append(
            gr.Audio(
                value=(output["sampling_rate"], output["audio"].squeeze()),
                type="numpy",
                autoplay=False,
                label=f"Finetuned {models_per_language[language][i][0]}",
                show_label=True,
                visible=True,
            )
        )

    # Pad with hidden audio components so the output count always matches max_speakers.
    out.extend([gr.Audio(visible=False)] * (max_speakers - len(out)))
    return out

css = """
#container{
    margin: 0 auto;
    max-width: 80rem;
}
#intro{
    max-width: 100%;
    text-align: center;
    margin: 0 auto;
}
"""

# Gradio blocks demo    
with gr.Blocks(css=css) as demo_blocks:
    gr.Markdown(title, elem_id="intro")

    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
            btn = gr.Button("Generate Audio!")
            language = gr.Dropdown(
                list(default_model_per_language.keys()),
                value="english",
                label="language",
                info="Language that you want to test",
            )

            gr.Markdown("""
    ## Datasets and models details
    
    ### English
    
    * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
    * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
    
    ### Spanish
    
    * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa). This model is part of Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project, aiming to
    provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
    and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
    * **Datasets**:  For each accent, we used 100 to 150 samples of a single speaker to finetune the model.
        - [Colombian Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-colombian-spanish).
        - [Argentinian Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-argentinian-spanish).
        - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
                
                """) 
            
        with gr.Column():
            outputs = []
            for i in range(max_speakers):
                out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
                outputs.append(out_audio)

    with gr.Accordion("Run VITS and MMS with transformers"):
        gr.Markdown(
            """
        ```bash
        pip install transformers
        ```
        ```py
        from transformers import pipeline
        import scipy
        pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
        
        results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")

        # write to a wav file
        scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
        ```
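
        The same pipeline call should also work with the MMS checkpoints used in this demo, for example the Spanish base model:
        ```py
        pipe = pipeline("text-to-speech", model="facebook/mms-tts-spa", device=0)
        results = pipe("El mejor vino viene en botellas viejas")
        scipy.io.wavfile.write("audio_mms.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
        ```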
        """
        )

    btn.click(generate_audio, [inp_text, language], outputs)
    

demo_blocks.queue().launch()