import os
import re
import subprocess
import tempfile

import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer


# Hub checkpoint name for every language offered by the demo.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# One VITS model per language, loaded once when the module is imported.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}

# Matching tokenizers, keyed by the same language names as ``models``.
tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}


# For certain checkpoints, the text needs to be romanized first.
# MMS-TTS uses the uroman.pl script from https://github.com/isi-nlp/uroman for this.
# The uroman repository must be available in a folder named "uroman" in the
# working directory, so that "uroman/bin/uroman.pl" exists.
def uromanize(text, uroman_pl):
    """Romanize *text* with the uroman Perl script and return it as one line.

    The text is piped to ``perl <uroman_pl> -l xxx`` on stdin and the
    script's stdout is read back, both as UTF-8.

    Args:
        text: The text to romanize.
        uroman_pl: Path to the ``uroman.pl`` script.

    Returns:
        The script output with every run of whitespace (including line
        breaks) collapsed to a single space and both ends stripped. For
        single-line input this is exactly the romanized line.

    Raises:
        RuntimeError: If ``perl`` cannot be started or the script exits with
            a non-zero status.
    """
    iso = "xxx"  # value handed to the script's -l option
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; stdin/stdout pipes replace the temp files.
    cmd = ["perl", uroman_pl, "-l", iso]
    try:
        result = subprocess.run(
            cmd,
            input=text,
            capture_output=True,
            encoding="utf-8",
            check=True,
        )
    except FileNotFoundError as err:
        raise RuntimeError(
            "perl executable not found; it is required to run uroman"
        ) from err
    except subprocess.CalledProcessError as err:
        details = (err.stderr or "").strip()
        raise RuntimeError(
            f"uroman failed with exit code {err.returncode}: {details}"
        ) from err

    # Collapse all whitespace so multi-line input is not silently truncated
    # to its first line.
    return re.sub(r"\s+", " ", result.stdout).strip()


def predict(text, language=None):
    """Synthesize speech for *text* with the model registered for *language*.

    Args:
        text: The text to speak. ``None``, empty or whitespace-only text
            yields an empty waveform.
        language: Key into the module-level ``models`` and ``tokenizers``
            dicts. ``None`` falls back to ``"English"``.

    Returns:
        A pair ``((sampling_rate, waveform), processed_text)``: ``waveform``
        is an ``int16`` NumPy array and ``processed_text`` is the text that
        was actually fed to the model (romanized for Korean, otherwise
        decoded back from the token ids). The same shape is returned for
        empty input so both UI outputs always receive a value.

    Raises:
        KeyError: If *language* has no entry in ``models``/``tokenizers``.
    """
    default_sampling_rate = 16000

    if language is None:
        language = "English"

    if not text or not text.strip():
        # Keep the (audio, text) shape of the normal return path.
        return (default_sampling_rate, np.zeros(0, dtype=np.int16)), ""

    if language == "Korean":
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    if language != "Korean":
        # Report the text as decoded back from the token ids; for Korean the
        # romanized text from uromanize is reported instead.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    sampling_rate = getattr(model.config, "sampling_rate", default_sampling_rate)
    waveform = outputs.audio[0].numpy()
    # Clip before the cast so out-of-range samples saturate instead of
    # wrapping around in int16.
    speech = np.clip(waveform * 32767, -32768, 32767).astype(np.int16)
    return (sampling_rate, speech), text


# Heading of the demo; passed to gr.Interface as ``title``.
title = "MMS-TTS speech synthesis"

# Markdown introduction; passed to gr.Interface as ``description``.
description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
over 1000 text-to-speech (TTS) models.

This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
model, this code can also be used to run VITS checkpoints.
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).

As the model performs random sampling, the generated speech is slightly different each time.
The voice may also vary between runs, or sometimes even in the same sentence.
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
are not conditioned on a speaker ID.)
"""

# HTML block with reference links and the BibTeX citation; passed to
# gr.Interface as ``article``.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
</p>

<pre>
@article{pratap2023mms,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  journal={arXiv},
  year={2023}
}
</pre>

</div>
"""

# Sample sentences grouped by the language they are spoken in.
_EXAMPLE_SENTENCES = {
    "English": [
        "It is not in the stars to hold our destiny but in ourselves.",
        "The octopus and Oliver went to the opera in October.",
        "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
        "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
        "A synonym for cinnamon is a cinnamon synonym.",
        "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
    ],
    "German": [
        "Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.",
        "Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.",
    ],
    "Korean": [
        # Hello world, the weather is beautiful (Google Translate)
        "안녕 세상, 날씨는 아름다워",
    ],
}

# Flattened into [text, language] rows, one per example, in table order.
examples = [
    [sentence, language]
    for language, sentences in _EXAMPLE_SENTENCES.items()
    for sentence in sentences
]

# Input widgets, in the order of predict's parameters (text, language).
input_components = [
    gr.Text(label="Input Text"),
    gr.Radio(
        label="Language",
        choices=["English", "German", "Korean"],
        value="English",
    ),
]

# Output widgets: the generated audio and the processed text.
output_components = [
    gr.Audio(label="Generated Speech", type="numpy"),
    gr.Text(label="Processed text"),
]

# Build the interface and start serving it as soon as the module runs.
demo = gr.Interface(
    fn=predict,
    inputs=input_components,
    outputs=output_components,
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()