import torch
from transformers import pipeline
import numpy as np
import gradio as gr


def _grab_best_device(use_gpu=True):
    """Return the name of the device to run inference on.

    Args:
        use_gpu: When False, "cpu" is returned even if a CUDA device exists.

    Returns:
        "cuda" if at least one CUDA device is visible and ``use_gpu`` is
        true, otherwise "cpu".
    """
    if torch.cuda.device_count() > 0 and use_gpu:
        return "cuda"
    return "cpu"


device = _grab_best_device()

HUB_PATH = "ylacombe/vits_vctk_welsh_male"

# Use the detected device instead of a hard-coded CUDA index (device=0),
# which raises on machines that have no GPU.
pipe = pipeline("text-to-speech", model=HUB_PATH, device=device)

title = "# 🐶 VITS"

description = """ """

# One output widget (and one generated clip) is produced per speaker.
num_speakers = pipe.model.config.num_speakers


# Inference
def generate_audio(text):
    """Synthesise ``text`` once for every speaker id of the loaded model.

    Args:
        text: The sentence to synthesise.

    Returns:
        A list with ``num_speakers`` entries, ordered by speaker id. Each
        entry is a ``(sampling_rate, audio)`` tuple built from the pipeline
        output, with ``squeeze()`` applied to the audio array.
    """
    clips = []
    for speaker_id in range(num_speakers):
        output = pipe(text, forward_params={"speaker_id": speaker_id})
        clips.append((output["sampling_rate"], output["audio"].squeeze()))
    return clips


# Gradio blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            inp_text = gr.Textbox(label="Input Text", info="What would you like bark to synthesise?")
            btn = gr.Button("Generate Audio!")

        with gr.Column():
            outputs = []
            for i in range(num_speakers):
                out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio {i}", show_label=True)
                outputs.append(out_audio)

    # `outputs` is already a flat list of components; wrapping it in another
    # list would hand Gradio a nested list instead of the Audio widgets.
    btn.click(generate_audio, [inp_text], outputs)


demo_blocks.launch()