File size: 5,019 Bytes
1c3a7ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import re
import subprocess
import tempfile

import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer


# Checkpoint identifier for each language offered by the demo.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# Language name -> loaded VITS model.
models = {
    name: VitsModel.from_pretrained(checkpoint)
    for name, checkpoint in _CHECKPOINTS.items()
}

# Language name -> tokenizer loaded from the same checkpoint as the model.
tokenizers = {
    name: VitsTokenizer.from_pretrained(checkpoint)
    for name, checkpoint in _CHECKPOINTS.items()
}


# For certain checkpoints, the text needs to be romanized before tokenization.
# MMS-TTS uses the uroman.pl script from https://github.com/isi-nlp/uroman for this.
# The uroman repository must be available in a folder named "uroman"
# (the script is looked up at uroman/bin/uroman.pl).
def uromanize(text, uroman_pl):
    """Romanize ``text`` by piping it through the uroman Perl script.

    The text is sent to the script on standard input as UTF-8 and the
    romanized result is read back from its standard output, so no temporary
    files or shell string are involved.

    Args:
        text: The text to romanize.
        uroman_pl: Path to the ``uroman.pl`` script; it is executed with
            ``perl``.

    Returns:
        The script's output with every run of whitespace (including line
        breaks) collapsed to a single space and both ends stripped. For
        multi-line input all lines are kept, joined by single spaces.

    Raises:
        FileNotFoundError: If the ``perl`` executable cannot be found.
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    iso = "xxx"  # value handed to the script's -l option
    completed = subprocess.run(
        ["perl", uroman_pl, "-l", iso],
        input=text,
        capture_output=True,
        encoding="utf-8",
        check=True,  # fail loudly instead of returning garbage on error
    )
    return re.sub(r"\s+", " ", completed.stdout).strip()


def predict(text, language=None):
    """Synthesize speech for ``text`` with the checkpoint chosen by ``language``.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty waveform.
        language: Key into the module-level ``tokenizers`` and ``models``
            dicts. For "Korean" the text is romanized with uroman before
            tokenization.

    Returns:
        A pair ``((sampling_rate, waveform), processed_text)``:
        ``sampling_rate`` is the fixed value 16000, ``waveform`` is a NumPy
        ``int16`` array, and ``processed_text`` is the romanized text for
        Korean or the tokenizer's decoded view of the input otherwise. The
        same two-element shape is returned for empty input, with an empty
        waveform and an empty string.
    """
    sampling_rate = 16000

    if not text or not text.strip():
        # Keep the (audio, text) shape of the normal path: the interface has
        # two outputs, so a bare (rate, samples) tuple would be split across
        # them.
        return (sampling_rate, np.zeros(0, dtype=np.int16)), ""

    if language == "Korean":
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    if language != "Korean":
        # Show what the tokenizer actually kept of the input.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    speech = outputs.audio[0]
    # Scale the floating-point waveform to 16-bit PCM.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (sampling_rate, speech), text


# Heading passed to gr.Interface as `title`.
title = "MMS-TTS speech synthesis"

# Markdown text passed to gr.Interface as `description`.
description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
over 1000 text-to-speech (TTS) models.

This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
model, this code can also be used to run VITS checkpoints.
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).

As the model performs random sampling, the generated speech is slightly different each time.
The voice may also vary between runs, or sometimes even in the same sentence.
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
are not conditioned on a speaker ID.)
"""

# HTML passed to gr.Interface as `article`: reference links and the BibTeX
# entry for the MMS paper.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
</p>

<pre>
@article{pratap2023mms,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  journal={arXiv},
  year={2023}
}
</pre>

</div>
"""

# Sample sentences grouped by the language they are spoken in.
_EXAMPLE_SENTENCES = {
    "English": [
        "It is not in the stars to hold our destiny but in ourselves.",
        "The octopus and Oliver went to the opera in October.",
        "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
        "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
        "A synonym for cinnamon is a cinnamon synonym.",
        "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
    ],
    "German": [
        "Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.",
        "Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.",
    ],
    "Korean": [
        "안녕 세상, 날씨는 아름다워",  # Hello world, the weather is beautiful (Google Translate)
    ],
}

# Flattened into [text, language] rows, one per example.
examples = [
    [sentence, language_name]
    for language_name, sentences in _EXAMPLE_SENTENCES.items()
    for sentence in sentences
]

# Input widgets: the text to speak and the language to speak it in.
text_input = gr.Text(label="Input Text")
language_input = gr.Radio(
    label="Language",
    choices=["English", "German", "Korean"],
    value="English",
)

# Output widgets: the synthesized audio and the text as it was processed.
audio_output = gr.Audio(label="Generated Speech", type="numpy")
processed_text_output = gr.Text(label="Processed text")

# Assemble the demo around predict() and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[text_input, language_input],
    outputs=[audio_output, processed_text_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()