import numpy as np
import onnxruntime
from text import text_to_sequence, sequence_to_text
import torch
import gradio as gr
import soundfile as sf
import tempfile
import yaml
import json
import os
from time import perf_counter
import random

DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="quim")
DEFAULT_ACCENT = os.environ.get("DEFAULT_ACCENT", default="balear")


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result
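# For example, intersperse([1, 2, 3], 0) -> [0, 1, 0, 2, 0, 3, 0]; here 0 acts
# as a blank token interleaved between phoneme ids (see process_text below).
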
def process_text(i: int, text: str, device: torch.device, cleaner: str):
    print(f"[{i}] - Input text: {text}")
    x = torch.tensor(
        intersperse(text_to_sequence(text, [cleaner]), 0),
        dtype=torch.long,
        device=device,
    )[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(x_phones)
    return x.numpy(), x_lengths.numpy()
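# Shape sketch (hypothetical 5-phoneme input): text_to_sequence yields 5 ids,
# intersperse expands them to 11 tokens, so x has shape (1, 11) and x_lengths
# is (1,) holding [11], both int64 numpy arrays ready for ONNX Runtime.
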
# paths
MODEL_PATH_MATCHA_MEL_BAL="matcha_multispeaker_cat_bal_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA_MEL_CAT="matcha_multispeaker_cat_cen_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA_MEL_OCC="matcha_multispeaker_cat_occ_opset_15_10_steps.onnx"
MODEL_PATH_MATCHA_MEL_VAL="matcha_multispeaker_cat_val_opset_15_10_steps.onnx"
MODEL_PATH_VOCOS="mel_spec_22khz_cat.onnx"
CONFIG_PATH="config.yaml"
SPEAKER_ID_DICT="spk_to_id_2.json"
# Load models
sess_options = onnxruntime.SessionOptions()
model_matcha_mel_bal = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_BAL), sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha_mel_cat = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_CAT), sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha_mel_occ = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_OCC), sess_options=sess_options, providers=["CPUExecutionProvider"])
model_matcha_mel_val = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_VAL), sess_options=sess_options, providers=["CPUExecutionProvider"])
model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
with open(SPEAKER_ID_DICT) as f:
    speaker_id_dict = json.load(f)
accents = list(speaker_id_dict.keys())
models = {
    "balear": model_matcha_mel_bal,
    "nord-occidental": model_matcha_mel_occ,
    "valencia": model_matcha_mel_val,
    "central": model_matcha_mel_cat,
}
cleaners = {
    "balear": "catalan_balear_cleaners",
    "nord-occidental": "catalan_occidental_cleaners",
    "valencia": "catalan_valencia_cleaners",
    "central": "catalan_cleaners",
}
speakers = list(speaker_id_dict[DEFAULT_ACCENT].keys())

def vocos_inference(mel, denoise):
    with open(CONFIG_PATH, "r") as f:
        config = yaml.safe_load(f)
    params = config["feature_extractor"]["init_args"]
    sample_rate = params["sample_rate"]
    n_fft = params["n_fft"]
    hop_length = params["hop_length"]
    win_length = n_fft

    # ONNX inference
    mag, x, y = model_vocos.run(
        None,
        {"mels": mel},
    )

    # complex spectrogram from vocos output
    spectrogram = mag * (x + 1j * y)
    window = torch.hann_window(win_length)

    if denoise:
        # Vocoder bias: run Vocos on an all-zero mel of the same shape
        mel_rand = torch.zeros_like(torch.tensor(mel))
        mag_bias, x_bias, y_bias = model_vocos.run(
            None,
            {"mels": mel_rand.float().numpy()},
        )
        # complex spectrogram of the bias
        spectrogram_bias = mag_bias * (x_bias + 1j * y_bias)

        # Denoising
        spec = torch.view_as_real(torch.tensor(spectrogram))
        # get magnitude of vocos spectrogram
        mag_spec = torch.sqrt(spec.pow(2).sum(-1))
        # get magnitude of bias spectrogram
        spec_bias = torch.view_as_real(torch.tensor(spectrogram_bias))
        mag_spec_bias = torch.sqrt(spec_bias.pow(2).sum(-1))
        # subtract
        strength = 0.0025
        mag_spec_denoised = mag_spec - mag_spec_bias * strength
        mag_spec_denoised = torch.clamp(mag_spec_denoised, 0.0)
        # return to complex spectrogram from magnitude, keeping original phase
        angle = torch.atan2(spec[..., -1], spec[..., 0])
        spectrogram = torch.complex(
            mag_spec_denoised * torch.cos(angle),
            mag_spec_denoised * torch.sin(angle),
        )

    # Inverse STFT
    pad = (win_length - hop_length) // 2
    spectrogram = torch.as_tensor(spectrogram)
    B, N, T = spectrogram.shape
    print("Spectrogram synthesized shape", spectrogram.shape)
    # Inverse FFT
    ifft = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
    ifft = ifft * window[None, :, None]
    # Overlap and add
    output_size = (T - 1) * hop_length + win_length
    y = torch.nn.functional.fold(
        ifft,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    )[:, 0, 0, pad:-pad]
    # Window envelope
    window_sq = window.square().expand(1, T, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(
        window_sq,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    ).squeeze()[pad:-pad]
    # Normalize
    assert (window_envelope > 1e-11).all()
    y = y / window_envelope
    return y
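# Note on the denoise branch above: it is a light spectral subtraction. Vocos
# is run once on an all-zero mel to estimate the vocoder's bias spectrum, then
#   |S_denoised| = clamp(|S| - strength * |S_bias|, min=0)
# while the original phase (recovered via atan2) is kept. The inverse STFT is
# done by hand (irfft + Hann windowing + fold for overlap-add), presumably
# because the exported ONNX graph stops at magnitude/phase components.
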

def tts(text: str, accent: str, spk_name: str, temperature: float, length_scale: float, denoise: bool):
    spk_id = speaker_id_dict[accent][spk_name]
    sid = np.array([int(spk_id)]) if spk_id is not None else None
    text_matcha, text_lengths = process_text(0, text, "cpu", cleaner=cleaners[accent])
    model_matcha_mel = models[accent]

    # MATCHA VOCOS
    inputs = {
        "x": text_matcha,
        "x_lengths": text_lengths,
        "scales": np.array([temperature, length_scale], dtype=np.float32),
        "spks": sid,
    }
    mel_t0 = perf_counter()
    # matcha mel inference
    mel, mel_lengths = model_matcha_mel.run(None, inputs)
    mel_infer_secs = perf_counter() - mel_t0
    print("Matcha Mel inference time", mel_infer_secs)

    vocos_t0 = perf_counter()
    # vocos inference
    wavs_vocos = vocos_inference(mel, denoise)
    vocos_infer_secs = perf_counter() - vocos_t0
    print("Vocos inference time", vocos_infer_secs)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
        sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
    print(f"RTF matcha + vocos {(mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1] / 22050)}")
    return fp_matcha_vocos.name
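# Example call (hypothetical values; assumes "quim" is a speaker of the
# "balear" accent in spk_to_id_2.json, as the defaults above suggest):
#   wav_path = tts("Bon dia!", accent="balear", spk_name="quim",
#                  temperature=0.667, length_scale=1.0, denoise=True)
#   # -> path to a 22.05 kHz, 24-bit wav file
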
## GUI space
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
Natural and efficient TTS in Catalan
</h1> </div>
</div>
"""
description = """
🍵 Matcha-TTS is a non-autoregressive neural TTS approach that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis.

As vocoder we use [Vocos](https://huggingface.co/BSC-LT/vocos-mel-22khz-cat), trained on a Catalan set of ~28 hours.
[Matcha](https://huggingface.co/BSC-LT/matcha-tts-cat-multispeaker) was trained on the openslr69 and festcat datasets.
"""
with open("about.md", "r", encoding="utf-8") as f:
about = f.read()
article = "Training and demo by the Language Technologies Unit of the Barcelona Supercomputing Center."

def rs_change(accent):
    # Pick one of the accent's two speakers at random when the accent changes
    rnd_idx = random.randint(0, 1)
    return gr.Dropdown(
        choices=list(speaker_id_dict[accent].keys()),
        interactive=True,
        value=list(speaker_id_dict[accent].keys())[rnd_idx],
    )

accent_dropdown = gr.Dropdown(
    choices=accents,
    label="Accent",
    value=DEFAULT_ACCENT,
    info="Models are trained on 4 accents",
)
speaker_dropdown = gr.Dropdown(
    choices=list(speaker_id_dict[DEFAULT_ACCENT].keys()),
    label="Speaker id",
    value=DEFAULT_SPEAKER_ID,
    info="Models are trained on 2 speakers. You can prompt the model using one of these speaker ids.",
    interactive=True,
)
matcha_inference = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
            max_lines=1,
            label="Input text",
        ),
        accent_dropdown,
        speaker_dropdown,
        gr.Slider(
            0.1,
            2.0,
            value=0.667,
            step=0.01,
            label="Temperature",
            info="Sampling temperature: higher values give more varied speech",
        ),
        gr.Slider(
            0.5,
            2.0,
            value=1.0,
            step=0.01,
            label="Length scale",
            info="Controls speech pace: larger values for slower pace, smaller values for faster pace",
        ),
        gr.Checkbox(label="Denoise", info="Removes vocoder bias from the Vocos output", value=True),
    ],
    outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath")],
)
about_article = gr.Markdown(about)

demo = gr.Blocks()
with demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([matcha_inference, about_article], ["Demo", "About"])
    accent_dropdown.select(fn=rs_change, inputs=accent_dropdown, outputs=speaker_dropdown)
    gr.Markdown(article)

demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)