Spaces:

Matthijs
/

mms-tts-demo

Runtime error

mms-tts-demo / app.py

Matthijs Hollemans

here we go!

f0839e8 12 months ago

No virus

5.02 kB

	import gradio as gr
	import numpy as np
	import torch
	import os
	import re
	import tempfile

	from transformers import VitsModel, VitsTokenizer


	# Hub checkpoint used for each language offered in the demo.
	_CHECKPOINTS = {
	    "English": "Matthijs/mms-tts-eng",
	    "German": "Matthijs/mms-tts-deu",
	    "Korean": "Matthijs/mms-tts-kor",
	}

	# Every model is loaded up front (at import time), keyed by language name.
	models = {
	    language: VitsModel.from_pretrained(checkpoint)
	    for language, checkpoint in _CHECKPOINTS.items()
	}

	# Matching tokenizers, loaded from the same checkpoints and keyed the same way.
	tokenizers = {
	    language: VitsTokenizer.from_pretrained(checkpoint)
	    for language, checkpoint in _CHECKPOINTS.items()
	}


	# For certain checkpoints, the text needs to be romanized.
	# MMS-TTS uses uroman.pl for this, from https://github.com/isi-nlp/uroman
	# This needs to be installed in the folder "uroman"
	def uromanize(text, uroman_pl):
	    """Romanize ``text`` by piping it through the uroman Perl script.

	    The text is sent to the script on stdin as UTF-8 and the romanized
	    result is read back from stdout, so no temporary files or shell are
	    involved.

	    Args:
	        text: The text to romanize.
	        uroman_pl: Path to the ``uroman.pl`` script.

	    Returns:
	        The script's output with every run of whitespace (including line
	        breaks) collapsed to a single space and the ends stripped. Empty
	        output yields an empty string.

	    Raises:
	        FileNotFoundError: If the ``perl`` executable cannot be found.
	        subprocess.CalledProcessError: If the script exits with a non-zero
	            status.
	    """
	    # Imported locally so this function does not depend on a file-level import.
	    import subprocess

	    iso = "xxx"  # value handed to the script's -l option
	    # An argument list (no shell) keeps paths containing spaces or shell
	    # metacharacters intact; check=True surfaces a failed run immediately
	    # instead of silently continuing with missing output.
	    completed = subprocess.run(
	        ["perl", uroman_pl, "-l", iso],
	        input=text,
	        capture_output=True,
	        text=True,
	        encoding="utf-8",
	        check=True,
	    )
	    return re.sub(r"\s+", " ", completed.stdout).strip()


	def predict(text, language=None):
	    """Synthesize speech for ``text`` using the model for ``language``.

	    Args:
	        text: The text to speak.
	        language: A key of the module-level ``models`` / ``tokenizers``
	            dictionaries ("English", "German" or "Korean").

	    Returns:
	        A pair ``((sampling_rate, samples), processed_text)`` where
	        ``samples`` is an int16 NumPy array and ``processed_text`` is the
	        text as it was fed to the model (romanized for Korean, otherwise
	        the tokenizer's decoded form). Blank input yields an empty sample
	        array and an empty string.

	    Raises:
	        KeyError: If ``language`` is not one of the loaded languages.
	    """
	    if not text or not text.strip():
	        # The interface has two outputs (audio and text), so the blank-input
	        # path must return a value for both of them as well.
	        return (16000, np.zeros(0, dtype=np.int16)), ""

	    if language == "Korean":
	        # The Korean checkpoint is fed romanized text.
	        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
	        text = uromanize(text, uroman_pl)

	    tokenizer = tokenizers[language]
	    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

	    if language != "Korean":
	        # Report the text the way the tokenizer actually saw it.
	        text = tokenizer.batch_decode(input_ids)[0]

	    with torch.no_grad():
	        outputs = models[language](input_ids)

	    waveform = outputs.audio[0].numpy()
	    # Presumably the waveform is float audio in [-1, 1] (TODO confirm);
	    # clipping keeps any stray sample outside that range from wrapping
	    # around when cast to int16.
	    speech = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
	    return (16000, speech), text


	# Heading passed to gr.Interface as `title`.
	title = "MMS-TTS speech synthesis"

	# Introductory text passed to gr.Interface as `description`; it uses
	# Markdown link syntax.
	description = """
	Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
	speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
	over 1000 text-to-speech (TTS) models.

	This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
	model, this code can also be used to run VITS checkpoints.
	For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).

	As the model performs random sampling, the generated speech is slightly different each time.
	The voice may also vary between runs, or sometimes even in the same sentence.
	(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
	are not conditioned on a speaker ID.)
	"""

	# HTML footer passed to gr.Interface as `article`: reference links followed
	# by the BibTeX entry for the MMS paper. The link separators are plain "|"
	# characters; a backslash before them would be an invalid escape sequence
	# in a normal string literal and would also show up in the rendered page.
	article = """
	<div style='margin:20px auto;'>

	<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
	<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
	<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
	<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
	</p>

	<pre>
	@article{pratap2023mms,
	title={Scaling Speech Technology to 1,000+ Languages},
	author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
	journal={arXiv},
	year={2023}
	}
	</pre>

	</div>
	"""

	# Example sentences, grouped by the language they are synthesized in.
	_english_examples = [
	    "It is not in the stars to hold our destiny but in ourselves.",
	    "The octopus and Oliver went to the opera in October.",
	    "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
	    "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
	    "A synonym for cinnamon is a cinnamon synonym.",
	    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
	]
	_german_examples = [
	    "Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.",
	    "Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.",
	]
	_korean_examples = [
	    "안녕 세상, 날씨는 아름다워",  # Hello world, the weather is beautiful (Google Translate)
	]

	# One [text, language] row per example, in the order English, German, Korean.
	examples = [
	    [sentence, language]
	    for language, sentences in (
	        ("English", _english_examples),
	        ("German", _german_examples),
	        ("Korean", _korean_examples),
	    )
	    for sentence in sentences
	]

	# Input widgets: free text plus a language selector that defaults to English.
	text_input = gr.Text(label="Input Text")
	language_input = gr.Radio(
	    label="Language",
	    choices=["English", "German", "Korean"],
	    value="English",
	)

	# Output widgets: the synthesized audio and the text as it was processed.
	speech_output = gr.Audio(label="Generated Speech", type="numpy")
	processed_text_output = gr.Text(label="Processed text")

	# Assemble the demo around `predict` and start serving it.
	demo = gr.Interface(
	    fn=predict,
	    inputs=[text_input, language_input],
	    outputs=[speech_output, processed_text_output],
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	)
	demo.launch()