"""Gradio demo that synthesizes speech from text with a VITS checkpoint."""

import gradio as gr
import scipy
from transformers import VitsModel, AutoTokenizer
import torch

# Hugging Face checkpoint used for both the tokenizer and the acoustic model.
MODEL_ID = "facebook/mms-tts-crh"

# Loaded once at import time so every request reuses the same instances.
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Markdown shown at the top of the page.
# NOTE(review): steps 2 and 3 mention model/speaker selectors that this UI
# does not create -- confirm whether the text should be trimmed.
DESCRIPTION = """

Balacoon🦝 Text-to-Speech

1. Write an utterance to generate,
2. Select the model to synthesize with
3. Select speaker
4. Hit "Generate" and listen to the result!

You can learn more about models available
[here](https://huggingface.co/balacoon/tts).
Visit [Balacoon website](https://balacoon.com/) for more info.
"""


def synthesize_audio(text_str: str):
    """Synthesize speech for the utterance typed into the `text` Textbox.

    Args:
        text_str: Utterance to synthesize.

    Returns:
        A ``(sampling_rate, samples)`` tuple suitable as a `gr.Audio` value,
        where ``samples`` is a 1-D NumPy array, or ``None`` when the input is
        empty or whitespace-only (nothing to synthesize).
    """
    # Guard against empty input instead of feeding a zero-length sequence
    # to the model.
    if not text_str or not text_str.strip():
        return None

    inputs = tokenizer(text_str, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # `gr.Audio` expects (sample_rate, numpy_array); the model returns a
    # batched tensor, so take the single batch item and convert it.
    samples = waveform[0].cpu().numpy()
    return model.config.sampling_rate, samples


def main():
    """Build the Blocks UI, wire the "Generate" button and launch the app."""
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")
        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        generate.click(synthesize_audio, inputs=[text], outputs=audio)

    # NOTE(review): `concurrency_count` is kept as in the original call;
    # confirm it matches the Gradio version pinned for this app.
    demo.queue(concurrency_count=1).launch()


main()