# Balacoon🦝 Text-to-Speech

import gradio as gr
import scipy
from transformers import VitsModel, AutoTokenizer
import torch

# Hub checkpoint id shared by the model weights and the matching tokenizer,
# so the two can never drift apart.
_CHECKPOINT = "facebook/mms-tts-crh"

# Loaded once at import time and reused by every synthesis request.
model = VitsModel.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

def main() -> None:
    """Build the Gradio text-to-speech demo and launch it.

    The UI consists of a text box, a "Generate" button and an audio player.
    Clicking the button runs the module-level ``tokenizer`` and ``model`` on
    the entered text and plays back the synthesized waveform.
    """
    with gr.Blocks() as demo:
        # NOTE(review): steps 2 and 3 below mention model and speaker
        # selection, but this UI only exposes a text box. The copy is left
        # untouched here; confirm the intended wording with the owner.
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>
            1. Write an utterance to generate,
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!
            You can learn more about models available
            [here](https://huggingface.co/balacoon/tts).
            Visit [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str):
            """Synthesize speech for the text entered in the `text` Textbox.

            Args:
                text_str: utterance to synthesize.

            Returns:
                A ``(sample_rate, samples)`` tuple in the format expected by
                ``gr.Audio``, where ``samples`` is a 1-D numpy array, or
                ``None`` (clears the player) when no text was entered.
            """
            # Nothing to synthesize: skip the model instead of feeding it an
            # empty token sequence.
            if not text_str or not text_str.strip():
                return None

            inputs = tokenizer(text_str, return_tensors="pt")

            # Inference only; no gradients are needed.
            with torch.no_grad():
                waveform = model(**inputs).waveform

            # The model returns a batched tensor (batch, samples). gr.Audio
            # needs the sampling rate plus a plain numpy array for the single
            # utterance, not a raw torch tensor.
            samples = waveform[0].cpu().numpy()
            return model.config.sampling_rate, samples

        generate.click(synthesize_audio, inputs=[text], outputs=audio)

    # Queue incoming requests; concurrency_count=1 keeps a single worker so
    # synthesis requests are handled one at a time.
    demo.queue(concurrency_count=1).launch()


# Build the UI and start the server as soon as this module is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also launches the app — confirm that is intended.
main()