"""Gradio demo that synthesises speech with a multi-speaker VITS checkpoint."""

import torch
from transformers import pipeline
import numpy as np
import gradio as gr


def _grab_best_device(use_gpu=True):
    """Return ``"cuda"`` when a CUDA device is visible and allowed, else ``"cpu"``.

    Args:
        use_gpu: Set to ``False`` to force CPU even if a GPU is present.

    Returns:
        The device string to hand to the pipeline.
    """
    return "cuda" if use_gpu and torch.cuda.device_count() > 0 else "cpu"


device = _grab_best_device()

HUB_PATH = "ylacombe/vits_vctk_welsh_male"

# Use the detected device rather than a hard-coded GPU index, so the demo
# also starts on hosts that have no CUDA device.
pipe = pipeline("text-to-speech", model=HUB_PATH, device=device)

title = "# 🐶 VITS"

description = """

"""

num_speakers = pipe.model.config.num_speakers


# Inference
def generate_audio(text, spkr_id):
    """Synthesise ``text`` with the selected speaker.

    Args:
        text: The sentence to synthesise.
        spkr_id: Value of the speaker dropdown; forwarded to the model as
            ``speaker_id``. It is ``None`` when nothing has been selected.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, with singleton dimensions
        squeezed out of the waveform, for the numpy-typed audio output.
    """
    forward_params = {"speaker_id": spkr_id}
    output = pipe(text, forward_params=forward_params)
    return (output["sampling_rate"], output["audio"].squeeze())


# Gradio blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            # NOTE(review): the info text mentions "bark" although this demo
            # loads a VITS checkpoint -- confirm the intended wording.
            inp_text = gr.Textbox(
                label="Input Text",
                info="What would you like bark to synthesise?",
            )
            spkr = gr.Dropdown(
                list(range(num_speakers)),
                value=None,
                label="Speaker ID",
                info="Default: Unconditional Generation",
            )
            btn = gr.Button("Generate Audio!")

        with gr.Column():
            out_audio_vocos = gr.Audio(
                type="numpy",
                autoplay=False,
                label="Generated Audio",
                show_label=True,
            )

    btn.click(generate_audio, [inp_text, spkr], [out_audio_vocos])

demo_blocks.launch()