"""Gradio demo for MMS-TTS (VITS) speech synthesis with 🤗 Transformers."""

import os
import re
import subprocess
import tempfile  # noqa: F401  (kept: no longer needed by uromanize, but retained for compatibility)

import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer

# Sample rate (Hz) reported to Gradio alongside the generated audio.
SAMPLE_RATE = 16000

# Single source of truth for the language -> checkpoint mapping, so the model
# and tokenizer tables below can never drift apart.
CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in CHECKPOINTS.items()
}

tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in CHECKPOINTS.items()
}


# For certain checkpoints, the text needs to be romanized.
# MMS-TTS uses uroman.pl for this from https://github.com/isi-nlp/uroman
# This needs to be installed in the folder "uroman"
def uromanize(text, uroman_pl, iso="xxx"):
    """Romanize ``text`` by piping it through the uroman Perl script.

    Args:
        text: The text to romanize.
        uroman_pl: Path to the ``uroman.pl`` script.
        iso: Language code passed to uroman via ``-l``. Defaults to ``"xxx"``,
            the value this script has always used.

    Returns:
        The first line of uroman's output with every whitespace run collapsed
        to a single space and the ends stripped. Empty if uroman printed
        nothing.

    Raises:
        subprocess.CalledProcessError: If perl/uroman exits with a non-zero
            status.
        FileNotFoundError: If the ``perl`` executable cannot be found.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact, and explicit UTF-8 pipes avoid depending on the
    # locale encoding for non-Latin input. stderr is left attached to the
    # console so uroman diagnostics stay visible.
    completed = subprocess.run(
        ["perl", uroman_pl, "-l", iso],
        input=text,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        check=True,
    )
    first_line = completed.stdout.split("\n", 1)[0]
    return re.sub(r"\s+", " ", first_line).strip()


def predict(text, language=None):
    """Synthesize speech for ``text`` with the checkpoint for ``language``.

    Args:
        text: The text to speak.
        language: A key of ``models`` / ``tokenizers``.

    Returns:
        A pair ``((sample_rate, waveform), processed_text)`` matching the two
        Gradio outputs: ``waveform`` is a 1-D ``int16`` NumPy array and
        ``processed_text`` is the text as seen by the model (the romanized
        text for Korean, the tokenizer round-trip otherwise). Blank input
        yields an empty waveform and an empty string.

    Raises:
        KeyError: If ``language`` is not a known language.
    """
    if not text or not text.strip():
        # Two values are required because the interface declares two outputs;
        # returning only the audio tuple would be unpacked as (audio, text).
        return (SAMPLE_RATE, np.zeros(0, dtype=np.int16)), ""

    is_korean = language == "Korean"
    if is_korean:
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    if not is_korean:
        # Show the user what the tokenizer actually kept of their input.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    waveform = outputs.audio[0].numpy()
    # Clip before the cast: a sample outside [-1, 1] would otherwise wrap
    # around when converted to int16.
    speech = np.clip(waveform * 32767, -32768, 32767).astype(np.int16)
    return (SAMPLE_RATE, speech), text


title = "MMS-TTS speech synthesis"

# Built from adjacent literals so the source stays readable while the runtime
# string is unchanged.
description = (
    " Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) "
    "project aims to provide speech technology across a diverse range of languages. \n"
    "The MMS-TTS project contains a collection of over 1000 text-to-speech (TTS) models. "
    "This demo shows how to use MMS-TTS using 🤗 Transformers. "
    "Since MMS-TTS is based on the VITS model, this code can also be used to run VITS checkpoints. "
    "For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits). "
    "As the model performs random sampling, the generated speech is slightly different each time. "
    "The voice may also vary between runs, or sometimes even in the same sentence. "
    "(Note that 🤗 Transformers also supports multispeaker VITS checkpoints "
    "but the MMS-TTS checkpoints are not conditioned on a speaker ID.) "
)

article = """

References: MMS paper | blog post | original weights | original MMS space

@article{pratap2023mms,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  journal={arXiv},
  year={2023}
}
"""

examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "English"],
    ["The octopus and Oliver went to the opera in October.", "English"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "English"],
    ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "English"],
    ["A synonym for cinnamon is a cinnamon synonym.", "English"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "English"],
    ["Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.", "German"],
    ["Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.", "German"],
    ["안녕 세상, 날씨는 아름다워", "Korean"],  # Hello world, the weather is beautiful (Google Translate)
]

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Language",
            choices=[
                "English",
                "German",
                "Korean",
            ],
            value="English",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Processed text"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Launched at module level, as before, so running the script starts the demo.
demo.launch()