import gradio as gr
import scipy
from transformers import VitsModel, AutoTokenizer
import torch
# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# model and its tokenizer are always loaded from the same source.
_CHECKPOINT = "facebook/mms-tts-crh"

# Loaded once at import time and reused by every synthesis request.
model = VitsModel.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
def main():
    """Build the Gradio demo page and launch it.

    The page contains a text box, a "Generate" button and an audio player.
    Clicking the button feeds the typed text through the module-level
    ``tokenizer`` and ``model`` and hands the resulting waveform to the
    audio player.
    """
    with gr.Blocks() as demo:
        # NOTE(review): steps 2 and 3 of the text below mention selecting a
        # model and a speaker, but this UI only has a text box, a button and
        # an audio player -- confirm whether the intro should be reworded.
        gr.Markdown(
            """
Balacoon🦝 Text-to-Speech
1. Write an utterance to generate,
2. Select the model to synthesize with
3. Select speaker
4. Hit "Generate" and listen to the result!
You can learn more about models available
[here](https://huggingface.co/balacoon/tts).
Visit [Balacoon website](https://balacoon.com/) for more info.
"""
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")
        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str):
            """Synthesize speech for the utterance typed into the `text` Textbox.

            Args:
                text_str: utterance to synthesize.

            Returns:
                A ``(sample_rate, waveform)`` tuple for the `audio` component,
                where ``waveform`` is a 1-D numpy array, or ``None`` when the
                input is empty or whitespace-only.
            """
            # Nothing to synthesize: skip the model call instead of feeding it
            # an empty token sequence.
            if not text_str or not text_str.strip():
                return None

            inputs = tokenizer(text_str, return_tensors="pt")
            with torch.no_grad():
                output = model(**inputs).waveform

            # The waveform is batched as (batch, samples) and a single
            # utterance was submitted, so take the first row. gr.Audio needs
            # the sample rate alongside a numpy array, not a bare tensor.
            waveform = output[0].cpu().numpy()
            return model.config.sampling_rate, waveform

        generate.click(synthesize_audio, inputs=[text], outputs=audio)

    # Queue requests and process them one at a time.
    demo.queue(concurrency_count=1).launch()
# Launch the demo only when this file is executed as a script, so that
# importing the module does not start the web server.
if __name__ == "__main__":
    main()