from datetime import datetime
import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import VitsModel, AutoTokenizer

# Load the MMS Polish TTS checkpoint and its tokenizer once at startup.
tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")

title = "Polish Text-To-Speech model based on facebook/mms-tts-pol"


def tts(text: str, language: str = "pol") -> str:
    """Synthesize `text` and return the path of the generated WAV file.

    `language` is currently unused: the checkpoint is Polish-only.
    """
    print(f"When: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')} :", text)
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform.squeeze().numpy()
    # Write to a unique temporary file so concurrent requests do not
    # overwrite each other's audio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        out_path = wav_file.name
    sf.write(out_path, output, tts_model.config.sampling_rate)
    return out_path


with gr.Blocks() as blocks:
    gr.Markdown("# " + title)
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            textbox = gr.Textbox(
                label="Input",
                value="Cześć, co chciałbyś abym Ci przeczytał?",
            )
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
    audio = gr.Audio(label="Generated Audio (wav)", type="filepath", autoplay=False)
    submit.click(tts, textbox, audio)

blocks.launch()