# NOTE: the text that preceded the code here was web-page extraction residue
# ("Spaces: Paused", "File size: 3,758 Bytes", a per-line commit-hash gutter and
# a 1-106 line-number gutter); it is not part of the program.
import torch
from transformers import pipeline
import numpy as np
import gradio as gr
def _grab_best_device(use_gpu=True):
    """Return the name of the torch device to run inference on.

    Args:
        use_gpu: When false, CUDA is never selected, even if a GPU is present.

    Returns:
        ``"cuda"`` if ``use_gpu`` is true and torch reports at least one CUDA
        device, otherwise ``"cpu"``.
    """
    # Test the flag first so CUDA is not queried at all when the caller opted out.
    if use_gpu and torch.cuda.device_count() > 0:
        return "cuda"
    return "cpu"
# Resolved once at import time: "cuda" or "cpu", as chosen by _grab_best_device().
device = _grab_best_device()
# Base (non-finetuned) checkpoint for each supported language, keyed by language name.
default_model_per_language = {
    "english": "kakao-enterprise/vits-ljs",
    "spanish": "facebook/mms-tts-spa",
}

# Finetuned checkpoints for each language, as (display label, model id) pairs.
# The keys mirror those of default_model_per_language.
models_per_language = {
    "english": [
        (
            "Irish Male Speaker",
            "ylacombe/vits_ljs_irish_male_monospeaker_2",
        ),
        (
            "Welsh Female Speaker",
            "ylacombe/vits_ljs_welsh_female_monospeaker_2",
        ),
        (
            "Welsh Male Speaker",
            "ylacombe/vits_ljs_welsh_male_monospeaker_2",
        ),
        (
            "Scottish Female Speaker",
            "ylacombe/vits_ljs_scottish_female_monospeaker",
        ),
    ],
    "spanish": [
        (
            "Male Chilean Speaker",
            "ylacombe/mms-spa-finetuned-chilean-monospeaker",
        ),
        (
            "Female Argentinian Speaker",
            "ylacombe/mms-spa-finetuned-argentinian-monospeaker",
        ),
        (
            "Male Colombian Speaker",
            "ylacombe/mms-spa-finetuned-colombian-monospeaker",
        ),
    ],
}
# Mutable cache of the pipelines for the currently selected language:
#   "pipe"          - one pipeline per finetuned checkpoint, in models_per_language order
#   "original_pipe" - pipeline for the language's base checkpoint
#   "language"      - language the cached pipelines were loaded for
# Pipelines are placed on `device` instead of a hard-coded GPU index so that the
# module can also be imported on hosts without CUDA.
pipe_dict = {
    "pipe": [
        pipeline("text-to-speech", model=model_id, device=device)
        for _, model_id in models_per_language["english"]
    ],
    "original_pipe": pipeline(
        "text-to-speech",
        model=default_model_per_language["english"],
        device=device,
    ),
    "language": "english",
}
# Markdown heading shown at the top of the demo page.
title = "# VITS"

# Markdown body shown under the heading (still a placeholder).
description = "\nTODO\n"

# Total number of audio output slots in the UI, including the one used by the
# base checkpoint.
max_speakers = 15
# Inference
def _audio_component(prediction, label):
    """Wrap a text-to-speech pipeline prediction in a visible ``gr.Audio`` output."""
    return gr.Audio(
        value=(prediction["sampling_rate"], prediction["audio"].squeeze()),
        type="numpy",
        autoplay=False,
        label=label,
        show_label=True,
        visible=True,
    )


def generate_audio(text, language):
    """Synthesise ``text`` with the base and finetuned checkpoints of ``language``.

    When ``language`` differs from the one the cached pipelines were loaded
    for, the pipelines of the requested language are loaded and the cache in
    ``pipe_dict`` is replaced.

    Args:
        text: Sentence to synthesise.
        language: Key of ``default_model_per_language`` and
            ``models_per_language``.

    Returns:
        A list of exactly ``max_speakers`` ``gr.Audio`` components: the base
        checkpoint's prediction first, then one per finetuned checkpoint, then
        hidden placeholders for the unused slots.
    """
    if pipe_dict["language"] != language:
        gr.Warning(f"Language has changed - loading corresponding models: {default_model_per_language[language]}")
        # Load everything before touching the cache, so a failed load cannot
        # leave the cache labelled with a language whose models are missing.
        original_pipe = pipeline("text-to-speech", model=default_model_per_language[language], device=device)
        finetuned_pipes = [
            pipeline("text-to-speech", model=model_id, device=device)
            for _, model_id in models_per_language[language]
        ]
        pipe_dict["original_pipe"] = original_pipe
        pipe_dict["pipe"] = finetuned_pipes
        pipe_dict["language"] = language

    # First slot: prediction of the original (base) checkpoint.
    out = [
        _audio_component(
            pipe_dict["original_pipe"](text),
            f"Prediction from the original checkpoint {default_model_per_language[language]}",
        )
    ]

    # The base checkpoint occupies one slot, leaving max_speakers - 1 for the
    # finetuned speakers. Labels and pipelines share models_per_language order.
    speaker_names = [name for name, _ in models_per_language[language]]
    finetuned = list(zip(speaker_names, pipe_dict["pipe"]))[: max_speakers - 1]
    for speaker_name, speaker_pipe in finetuned:
        out.append(_audio_component(speaker_pipe(text), f"Finetuned {speaker_name}"))

    # Pad with hidden components so the result matches the fixed number of
    # output slots declared in the UI.
    out.extend(gr.Audio(visible=False) for _ in range(max_speakers - len(out)))
    return out
# Gradio blocks demo
# NOTE(review): the nesting below was reconstructed from a copy of the file
# whose indentation had been lost - confirm it matches the intended layout.
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        # Input column: text box, trigger button and language selector.
        with gr.Column():
            inp_text = gr.Textbox(label="Input Text", info="What would you like VITS to synthesise?")
            btn = gr.Button("Generate Audio!")
            language = gr.Dropdown(
                list(default_model_per_language),
                value="english",
                label="language",
                info="Language that you want to test",
            )
        # Output column: a fixed pool of hidden audio players that
        # generate_audio fills in and makes visible.
        with gr.Column():
            outputs = [
                gr.Audio(
                    type="numpy",
                    autoplay=False,
                    label=f"Generated Audio - speaker {i}",
                    show_label=True,
                    visible=False,
                )
                for i in range(max_speakers)
            ]
    # Event listeners have to be registered inside the Blocks context.
    btn.click(generate_audio, [inp_text, language], outputs)

demo_blocks.queue().launch()