kevinwang676 committed
Commit 1f74c86 · Parent(s): dd6f15d

Upload 6 files

- commons.py +171 -0
- mel_processing.py +112 -0
- models.py +351 -0
- modules.py +342 -0
- tts_voice.py +290 -0
- utils.py +305 -0
commons.py
ADDED
@@ -0,0 +1,171 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def rand_spec_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (num_timescales - 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    device = duration.device

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1. / norm_type)
    return total_norm
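
commons.py bundles the tensor helpers the model files below import. As a quick illustration of the two helpers the synthesizer leans on most, here is a minimal sketch (assuming the file above is importable as commons; the toy shapes are made up for illustration):

import torch
import commons  # the module defined above

# sequence_mask turns per-utterance lengths into a boolean [batch, max_len] mask.
lengths = torch.tensor([2, 4])
print(commons.sequence_mask(lengths, max_length=5))
# tensor([[ True,  True, False, False, False],
#         [ True,  True,  True,  True, False]])

# rand_slice_segments crops one random fixed-size window per batch item;
# training decodes only these short slices instead of whole utterances.
x = torch.randn(2, 8, 10)                         # [batch, channels, frames]
segments, ids = commons.rand_slice_segments(x, segment_size=4)
print(segments.shape, ids.shape)                  # torch.Size([2, 8, 4]) torch.Size([2])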
mel_processing.py
ADDED
@@ -0,0 +1,112 @@
import math
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import numpy as np
import librosa
import librosa.util as librosa_util
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
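
mel_processing.py memoizes the mel filterbank and Hann window per (size, dtype, device) key, so repeated calls avoid rebuilding them. A minimal usage sketch follows; the STFT settings below are illustrative assumptions, not values taken from this Space's config:

import torch
from mel_processing import mel_spectrogram_torch

wav = torch.rand(1, 16000) * 2 - 1   # 1 s of fake audio in [-1, 1), [batch, samples]
mel = mel_spectrogram_torch(wav, n_fft=1024, num_mels=80, sampling_rate=16000,
                            hop_size=320, win_size=1024, fmin=0, fmax=None)
# With center=False and (n_fft - hop)/2 reflect padding, 16000 samples
# at hop 320 give 50 frames.
print(mel.shape)                     # torch.Size([1, 80, 50]) -> [batch, mels, frames]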
models.py
ADDED
@@ -0,0 +1,351 @@
import copy
import math
import torch
from torch import nn
from torch.nn import functional as F

import commons
import modules

from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding


class ResidualCouplingBlock(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class Encoder(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask


class Generator(torch.nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if not use_spectral_norm else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if not use_spectral_norm else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class SpeakerEncoder(torch.nn.Module):
    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
        super(SpeakerEncoder, self).__init__()
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        embeds_raw = self.relu(self.linear(hidden[-1]))
        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        mel_slices = []
        for i in range(0, total_frames - partial_frames, partial_hop):
            mel_range = torch.arange(i, i + partial_frames)
            mel_slices.append(mel_range)

        return mel_slices

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        mel_len = mel.size(1)
        last_mel = mel[:, -partial_frames:]

        if mel_len > partial_frames:
            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
            mels = list(mel[:, s] for s in mel_slices)
            mels.append(last_mel)
            mels = torch.stack(tuple(mels), 0).squeeze(1)

            with torch.no_grad():
                partial_embeds = self(mels)
            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
            # embed = embed / torch.linalg.norm(embed, 2)
        else:
            with torch.no_grad():
                embed = self(last_mel)

        return embed


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self,
                 spec_channels,
                 segment_size,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 gin_channels,
                 ssl_dim,
                 use_spk,
                 **kwargs):

        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.ssl_dim = ssl_dim
        self.use_spk = use_spk

        self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

        if not self.use_spk:
            self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)

    def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        if spec_lengths is None:
            spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)

        if not self.use_spk:
            g = self.enc_spk(mel.transpose(1, 2))
        g = g.unsqueeze(-1)

        _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
        z_p = self.flow(z, spec_mask, g=g)

        z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
        o = self.dec(z_slice, g=g)

        return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, c, g=None, mel=None, c_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        if not self.use_spk:
            g = self.enc_spk.embed_utterance(mel.transpose(1, 2))
        g = g.unsqueeze(-1)

        z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
        z = self.flow(z_p, c_mask, g=g, reverse=True)
        o = self.dec(z * c_mask, g=g)

        return o
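
models.py wires these pieces into a FreeVC-style voice-conversion synthesizer: a content encoder (enc_p), a posterior encoder (enc_q), a normalizing flow, and a HiFi-GAN-like decoder. A minimal inference sketch follows; every hyperparameter below is an illustrative assumption (the real values live in this Space's JSON config, which is not part of this commit):

import torch
from models import SynthesizerTrn

net = SynthesizerTrn(
    spec_channels=641, segment_size=32,
    inter_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
    resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[10, 8, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    gin_channels=256, ssl_dim=256, use_spk=True,
).eval()

c = torch.randn(1, 256, 100)   # content features, [batch, ssl_dim, frames]
g = torch.randn(1, 256)        # speaker embedding; infer() adds the time axis
with torch.no_grad():
    wav = net.infer(c, g=g)
print(wav.shape)               # [1, 1, frames * prod(upsample_rates)] = [1, 1, 32000]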
modules.py
ADDED
@@ -0,0 +1,342 @@
import copy
import math
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import functional as F

from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm

import commons
from commons import init_weights, get_padding


LRELU_SLOPE = 0.1


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


class ConvReluNorm(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask


class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                                            groups=channels, dilation=dilation, padding=padding))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y
        return x * x_mask


class WN(torch.nn.Module):
    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)


class ResBlock1(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Log(nn.Module):
    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ElementwiseAffine(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class ResidualCouplingLayer(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x
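
modules.py supplies the flow and vocoder building blocks: WN is the non-causal WaveNet core, and ResidualCouplingLayer is the affine coupling step the flow stacks. Coupling layers must be exactly invertible; a small sanity-check sketch (toy shapes, assuming commons.py and modules.py sit side by side):

import torch
from modules import ResidualCouplingLayer

# Running the layer forward and then in reverse should reproduce the input.
layer = ResidualCouplingLayer(channels=4, hidden_channels=16, kernel_size=5,
                              dilation_rate=1, n_layers=2, mean_only=True)
x = torch.randn(1, 4, 10)              # [batch, channels, frames]
x_mask = torch.ones(1, 1, 10)
y, logdet = layer(x, x_mask)           # forward direction returns (output, log-determinant)
x_rec = layer(y, x_mask, reverse=True)
print(torch.allclose(x, x_rec, atol=1e-5))   # True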
tts_voice.py
ADDED
@@ -0,0 +1,290 @@
1 |
+
tts_order_voice = {'英语 (美国)-Jenny-女': 'en-US-JennyNeural',
|
2 |
+
'英语 (美国)-Guy-男': 'en-US-GuyNeural',
|
3 |
+
'英语 (美国)-Ana-女': 'en-US-AnaNeural',
|
4 |
+
'英语 (美国)-Aria-女': 'en-US-AriaNeural',
|
5 |
+
'英语 (美国)-Christopher-男': 'en-US-ChristopherNeural',
|
6 |
+
'英语 (美国)-Eric-男': 'en-US-EricNeural',
|
7 |
+
'英语 (美国)-Michelle-女': 'en-US-MichelleNeural',
|
8 |
+
'英语 (美国)-Roger-男': 'en-US-RogerNeural',
|
9 |
+
'西班牙语 (墨西哥)-Dalia-女': 'es-MX-DaliaNeural',
|
10 |
+
'西班牙语 (墨西哥)-Jorge-男': 'es-MX-JorgeNeural',
|
11 |
+
'韩语 (韩国)-Sun-Hi-女': 'ko-KR-SunHiNeural',
|
12 |
+
'韩语 (韩国)-InJoon-男': 'ko-KR-InJoonNeural',
|
13 |
+
'泰语 (泰国)-Premwadee-女': 'th-TH-PremwadeeNeural',
|
14 |
+
'泰语 (泰国)-Niwat-男': 'th-TH-NiwatNeural',
|
15 |
+
'越南语 (越南)-HoaiMy-女': 'vi-VN-HoaiMyNeural',
|
16 |
+
'越南语 (越南)-NamMinh-男': 'vi-VN-NamMinhNeural',
|
17 |
+
'日语 (日本)-Nanami-女': 'ja-JP-NanamiNeural',
|
18 |
+
'日语 (日本)-Keita-男': 'ja-JP-KeitaNeural',
|
19 |
+
'法语 (法国)-Denise-女': 'fr-FR-DeniseNeural',
|
20 |
+
'法语 (法国)-Eloise-女': 'fr-FR-EloiseNeural',
|
21 |
+
'法语 (法国)-Henri-男': 'fr-FR-HenriNeural',
|
22 |
+
'葡萄牙语 (巴西)-Francisca-女': 'pt-BR-FranciscaNeural',
|
23 |
+
'葡萄牙语 (巴西)-Antonio-男': 'pt-BR-AntonioNeural',
|
24 |
+
'印度尼西亚语 (印度尼西亚)-Ardi-男': 'id-ID-ArdiNeural',
|
25 |
+
'印度尼西亚语 (印度尼西亚)-Gadis-女': 'id-ID-GadisNeural',
|
26 |
+
'希伯来语 (以色列)-Avri-男': 'he-IL-AvriNeural',
|
27 |
+
'希伯来语 (以色列)-Hila-女': 'he-IL-HilaNeural',
|
28 |
+
'意大利语 (意大利)-Isabella-女': 'it-IT-IsabellaNeural',
|
29 |
+
'意大利语 (意大利)-Diego-男': 'it-IT-DiegoNeural',
|
30 |
+
'意大利语 (意大利)-Elsa-女': 'it-IT-ElsaNeural',
|
31 |
+
'荷兰语 (荷兰)-Colette-女': 'nl-NL-ColetteNeural',
|
32 |
+
'荷兰语 (荷兰)-Fenna-女': 'nl-NL-FennaNeural',
|
33 |
+
'荷兰语 (荷兰)-Maarten-男': 'nl-NL-MaartenNeural',
|
34 |
+
'马来语 (马来西亚)-Osman-男': 'ms-MY-OsmanNeural',
|
35 |
+
'马来语 (马来西亚)-Yasmin-女': 'ms-MY-YasminNeural',
|
36 |
+
'挪威语 (挪威)-Pernille-女': 'nb-NO-PernilleNeural',
|
37 |
+
'挪威语 (挪威)-Finn-男': 'nb-NO-FinnNeural',
|
38 |
+
'瑞典语 (瑞典)-Sofie-女': 'sv-SE-SofieNeural',
|
39 |
+
'瑞典语 (瑞典)-Mattias-男': 'sv-SE-MattiasNeural',
|
40 |
+
'阿拉伯语 (沙特阿拉伯)-Hamed-男': 'ar-SA-HamedNeural',
|
41 |
+
'阿拉伯语 (沙特阿拉伯)-Zariyah-女': 'ar-SA-ZariyahNeural',
|
42 |
+
'希腊语 (希腊)-Athina-女': 'el-GR-AthinaNeural',
|
43 |
+
'希腊语 (希腊)-Nestoras-男': 'el-GR-NestorasNeural',
|
44 |
+
'德语 (德国)-Katja-女': 'de-DE-KatjaNeural',
|
45 |
+
'德语 (德国)-Amala-女': 'de-DE-AmalaNeural',
|
46 |
+
'德语 (德国)-Conrad-男': 'de-DE-ConradNeural',
|
47 |
+
'德语 (德国)-Killian-男': 'de-DE-KillianNeural',
|
48 |
+
'阿拉伯语 (南非)-Adri-女': 'af-ZA-AdriNeural',
|
49 |
+
'阿拉伯语 (南非)-Willem-男': 'af-ZA-WillemNeural',
|
50 |
+
'阿姆哈拉语 (埃塞俄比亚)-Ameha-男': 'am-ET-AmehaNeural',
|
51 |
+
'阿姆哈拉语 (埃塞俄比亚)-Mekdes-女': 'am-ET-MekdesNeural',
|
52 |
+
'阿拉伯语 (阿拉伯联合酋长国)-Fatima-女': 'ar-AE-FatimaNeural',
|
53 |
+
'阿拉伯语 (阿拉伯联合酋长国)-Hamdan-男': 'ar-AE-HamdanNeural',
|
54 |
+
'阿拉伯语 (巴林)-Ali-男': 'ar-BH-AliNeural',
|
55 |
+
'阿拉伯语 (巴林)-Laila-女': 'ar-BH-LailaNeural',
|
56 |
+
'阿拉伯语 (阿尔及利亚)-Ismael-男': 'ar-DZ-IsmaelNeural',
|
57 |
+
'阿拉伯语 (埃及)-Salma-女': 'ar-EG-SalmaNeural',
|
58 |
+
'阿拉伯语 (埃及)-Shakir-男': 'ar-EG-ShakirNeural',
|
59 |
+
'阿拉伯语 (伊拉克)-Bassel-男': 'ar-IQ-BasselNeural',
|
60 |
+
'阿拉伯语 (伊拉克)-Rana-女': 'ar-IQ-RanaNeural',
|
61 |
+
'阿拉伯语 (约旦)-Sana-女': 'ar-JO-SanaNeural',
|
62 |
+
'阿拉伯语 (约旦)-Taim-男': 'ar-JO-TaimNeural',
|
63 |
+
'阿拉伯语 (科威特)-Fahed-男': 'ar-KW-FahedNeural',
|
64 |
+
'阿拉伯语 (科威特)-Noura-女': 'ar-KW-NouraNeural',
|
65 |
+
'阿拉伯语 (黎巴嫩)-Layla-女': 'ar-LB-LaylaNeural',
|
66 |
+
'阿拉伯语 (黎巴嫩)-Rami-男': 'ar-LB-RamiNeural',
|
67 |
+
'阿拉伯语 (利比亚)-Iman-女': 'ar-LY-ImanNeural',
|
68 |
+
'阿拉伯语 (利比亚)-Omar-男': 'ar-LY-OmarNeural',
|
69 |
+
'阿拉伯语 (摩洛哥)-Jamal-男': 'ar-MA-JamalNeural',
|
70 |
+
'阿拉伯语 (摩洛哥)-Mouna-女': 'ar-MA-MounaNeural',
|
71 |
+
'阿拉伯语 (阿曼)-Abdullah-男': 'ar-OM-AbdullahNeural',
|
72 |
+
'阿拉伯语 (阿曼)-Aysha-女': 'ar-OM-AyshaNeural',
|
73 |
+
'阿拉伯语 (卡塔尔)-Amal-女': 'ar-QA-AmalNeural',
|
74 |
+
'阿拉伯语 (卡塔尔)-Moaz-男': 'ar-QA-MoazNeural',
|
75 |
+
'阿拉伯语 (叙利亚)-Amany-女': 'ar-SY-AmanyNeural',
|
76 |
+
'阿拉伯语 (叙利亚)-Laith-男': 'ar-SY-LaithNeural',
|
77 |
+
'阿拉伯语 (突尼斯)-Hedi-男': 'ar-TN-HediNeural',
|
78 |
+
'阿拉伯语 (突尼斯)-Reem-女': 'ar-TN-ReemNeural',
|
79 |
+
'阿拉伯语 (也门)-Maryam-女': 'ar-YE-MaryamNeural',
|
80 |
+
'阿拉伯语 (也门)-Saleh-男': 'ar-YE-SalehNeural',
|
81 |
+
'阿塞拜疆语 (阿塞拜疆)-Babek-男': 'az-AZ-BabekNeural',
|
82 |
+
'阿塞拜疆语 (阿塞拜疆)-Banu-女': 'az-AZ-BanuNeural',
|
83 |
+
'保加利亚语 (保加利亚)-Borislav-男': 'bg-BG-BorislavNeural',
|
84 |
+
'保加利亚语 (保加利亚)-Kalina-女': 'bg-BG-KalinaNeural',
|
85 |
+
'孟加拉语 (孟加拉国)-Nabanita-女': 'bn-BD-NabanitaNeural',
|
86 |
+
'孟加拉语 (孟加拉国)-Pradeep-男': 'bn-BD-PradeepNeural',
|
87 |
+
'孟加拉语 (印度)-Bashkar-男': 'bn-IN-BashkarNeural',
|
88 |
+
'孟加拉语 (印度)-Tanishaa-女': 'bn-IN-TanishaaNeural',
|
89 |
+
'波斯尼亚语 (波斯尼亚和黑塞哥维那)-Goran-男': 'bs-BA-GoranNeural',
|
90 |
+
'波斯尼亚语 (波斯尼亚和黑塞哥维那)-Vesna-女': 'bs-BA-VesnaNeural',
|
91 |
+
'加泰罗尼亚语 (西班牙)-Joana-女': 'ca-ES-JoanaNeural',
|
92 |
+
'加泰罗尼亚语 (西班牙)-Enric-男': 'ca-ES-EnricNeural',
|
93 |
+
'捷克语 (捷克共和国)-Antonin-男': 'cs-CZ-AntoninNeural',
|
94 |
+
'捷克语 (捷克共和国)-Vlasta-女': 'cs-CZ-VlastaNeural',
|
95 |
+
'威尔士语 (英国)-Aled-男': 'cy-GB-AledNeural',
|
96 |
+
'威尔士语 (英国)-Nia-女': 'cy-GB-NiaNeural',
|
97 |
+
'丹麦语 (丹麦)-Christel-女': 'da-DK-ChristelNeural',
|
98 |
+
'丹麦语 (丹麦)-Jeppe-男': 'da-DK-JeppeNeural',
|
99 |
+
'德语 (奥地利)-Ingrid-女': 'de-AT-IngridNeural',
|
100 |
+
'德语 (奥地利)-Jonas-男': 'de-AT-JonasNeural',
|
101 |
+
'德语 (瑞士)-Jan-男': 'de-CH-JanNeural',
|
102 |
+
'德语 (瑞士)-Leni-女': 'de-CH-LeniNeural',
|
103 |
+
'英语 (澳大利亚)-Natasha-女': 'en-AU-NatashaNeural',
|
104 |
+
'英语 (澳大利亚)-William-男': 'en-AU-WilliamNeural',
|
105 |
+
'英语 (加拿大)-Clara-女': 'en-CA-ClaraNeural',
|
106 |
+
'英语 (加拿大)-Liam-男': 'en-CA-LiamNeural',
|
107 |
+
'英语 (英国)-Libby-女': 'en-GB-LibbyNeural',
|
108 |
+
'英语 (英国)-Maisie-女': 'en-GB-MaisieNeural',
|
109 |
+
'英语 (英国)-Ryan-男': 'en-GB-RyanNeural',
|
110 |
+
'英语 (英国)-Sonia-女': 'en-GB-SoniaNeural',
|
111 |
+
'英语 (英国)-Thomas-男': 'en-GB-ThomasNeural',
|
112 |
+
'英语 (香港)-Sam-男': 'en-HK-SamNeural',
|
113 |
+
'英语 (香港)-Yan-女': 'en-HK-YanNeural',
|
114 |
+
'英语 (爱尔兰)-Connor-男': 'en-IE-ConnorNeural',
|
115 |
+
'英语 (爱尔兰)-Emily-女': 'en-IE-EmilyNeural',
|
116 |
+
'英语 (印度)-Neerja-女': 'en-IN-NeerjaNeural',
|
117 |
+
'英语 (印度)-Prabhat-男': 'en-IN-PrabhatNeural',
|
118 |
+
'英语 (肯尼亚)-Asilia-女': 'en-KE-AsiliaNeural',
|
119 |
+
'英语 (肯尼亚)-Chilemba-男': 'en-KE-ChilembaNeural',
|
120 |
+
'英语 (尼日利亚)-Abeo-男': 'en-NG-AbeoNeural',
|
121 |
+
'英语 (尼日利亚)-Ezinne-女': 'en-NG-EzinneNeural',
|
122 |
+
'英语 (新西兰)-Mitchell-男': 'en-NZ-MitchellNeural',
|
123 |
+
'英语 (菲律宾)-James-男': 'en-PH-JamesNeural',
|
124 |
+
'英语 (菲律宾)-Rosa-女': 'en-PH-RosaNeural',
|
125 |
+
'英语 (新加坡)-Luna-女': 'en-SG-LunaNeural',
|
126 |
+
'英语 (新加坡)-Wayne-男': 'en-SG-WayneNeural',
|
127 |
+
'英语 (坦桑尼亚)-Elimu-男': 'en-TZ-ElimuNeural',
|
128 |
+
'英语 (坦桑尼亚)-Imani-女': 'en-TZ-ImaniNeural',
|
129 |
+
'英语 (南非)-Leah-女': 'en-ZA-LeahNeural',
|
130 |
+
'英语 (南非)-Luke-男': 'en-ZA-LukeNeural',
|
131 |
+
'西班牙语 (阿根廷)-Elena-女': 'es-AR-ElenaNeural',
|
132 |
+
'西班牙语 (阿根廷)-Tomas-男': 'es-AR-TomasNeural',
|
133 |
+
'西班牙语 (玻利维亚)-Marcelo-男': 'es-BO-MarceloNeural',
|
134 |
+
'西班牙语 (玻利维亚)-Sofia-女': 'es-BO-SofiaNeural',
|
135 |
+
'西班牙语 (哥伦比亚)-Gonzalo-男': 'es-CO-GonzaloNeural',
|
136 |
+
'西班牙语 (哥伦比亚)-Salome-女': 'es-CO-SalomeNeural',
|
137 |
+
'西班牙语 (哥斯达黎加)-Juan-男': 'es-CR-JuanNeural',
|
138 |
+
'西班牙语 (哥斯达黎加)-Maria-女': 'es-CR-MariaNeural',
|
139 |
+
'西班牙语 (古巴)-Belkys-女': 'es-CU-BelkysNeural',
|
140 |
+
'西班牙语 (多米尼加共和国)-Emilio-男': 'es-DO-EmilioNeural',
|
141 |
+
'西班牙语 (多米尼加共和国)-Ramona-女': 'es-DO-RamonaNeural',
|
142 |
+
'西班牙语 (厄瓜多尔)-Andrea-女': 'es-EC-AndreaNeural',
|
143 |
+
'西班牙语 (厄瓜多尔)-Luis-男': 'es-EC-LuisNeural',
|
144 |
+
'西班牙语 (西班牙)-Alvaro-男': 'es-ES-AlvaroNeural',
|
145 |
+
'西班牙语 (西班牙)-Elvira-女': 'es-ES-ElviraNeural',
|
146 |
+
'西班牙语 (赤道几内亚)-Teresa-女': 'es-GQ-TeresaNeural',
|
147 |
+
'西班牙语 (危地马拉)-Andres-男': 'es-GT-AndresNeural',
|
148 |
+
'西班牙语 (危地马拉)-Marta-女': 'es-GT-MartaNeural',
|
149 |
+
'西班牙语 (洪都拉斯)-Carlos-男': 'es-HN-CarlosNeural',
|
150 |
+
'西班牙语 (洪都拉斯)-Karla-女': 'es-HN-KarlaNeural',
|
151 |
+
'西班牙语 (尼加拉瓜)-Federico-男': 'es-NI-FedericoNeural',
|
152 |
+
'西班牙语 (尼加拉瓜)-Yolanda-女': 'es-NI-YolandaNeural',
|
153 |
+
'西班牙语 (巴拿马)-Margarita-女': 'es-PA-MargaritaNeural',
|
154 |
+
'西班牙语 (巴拿马)-Roberto-男': 'es-PA-RobertoNeural',
|
155 |
+
'西班牙语 (秘鲁)-Alex-男': 'es-PE-AlexNeural',
|
156 |
+
'西班牙语 (秘鲁)-Camila-女': 'es-PE-CamilaNeural',
|
157 |
+
'西班牙语 (波多黎各)-Karina-女': 'es-PR-KarinaNeural',
|
158 |
+
'西班牙语 (波多黎各)-Victor-男': 'es-PR-VictorNeural',
|
159 |
+
'西班牙语 (巴拉圭)-Mario-男': 'es-PY-MarioNeural',
|
160 |
+
'西班牙语 (巴拉圭)-Tania-女': 'es-PY-TaniaNeural',
|
161 |
+
'西班牙语 (萨尔瓦多)-Lorena-女': 'es-SV-LorenaNeural',
|
162 |
+
'西班牙语 (萨尔瓦多)-Rodrigo-男': 'es-SV-RodrigoNeural',
|
163 |
+
'西班牙语 (美国)-Alonso-男': 'es-US-AlonsoNeural',
|
164 |
+
'西班牙语 (美国)-Paloma-女': 'es-US-PalomaNeural',
|
165 |
+
'西班牙语 (乌拉圭)-Mateo-男': 'es-UY-MateoNeural',
|
166 |
+
'西班牙语 (乌拉圭)-Valentina-女': 'es-UY-ValentinaNeural',
|
167 |
+
'西班牙语 (委内瑞拉)-Paola-女': 'es-VE-PaolaNeural',
|
168 |
+
'西班牙语 (委内瑞拉)-Sebastian-男': 'es-VE-SebastianNeural',
|
169 |
+
'爱沙尼亚语 (爱沙尼亚)-Anu-���': 'et-EE-AnuNeural',
|
170 |
+
'爱沙尼亚语 (爱沙尼亚)-Kert-男': 'et-EE-KertNeural',
|
171 |
+
'波斯语 (伊朗)-Dilara-女': 'fa-IR-DilaraNeural',
|
172 |
+
'波斯语 (伊朗)-Farid-男': 'fa-IR-FaridNeural',
|
173 |
+
'芬兰语 (芬兰)-Harri-男': 'fi-FI-HarriNeural',
|
174 |
+
'芬兰语 (芬兰)-Noora-女': 'fi-FI-NooraNeural',
|
175 |
+
'法语 (比利时)-Charline-女': 'fr-BE-CharlineNeural',
|
176 |
+
'法语 (比利时)-Gerard-男': 'fr-BE-GerardNeural',
|
177 |
+
'法语 (加拿大)-Sylvie-女': 'fr-CA-SylvieNeural',
|
178 |
+
'法语 (加拿大)-Antoine-男': 'fr-CA-AntoineNeural',
|
179 |
+
'法语 (加拿大)-Jean-男': 'fr-CA-JeanNeural',
|
180 |
+
'法语 (瑞士)-Ariane-女': 'fr-CH-ArianeNeural',
|
181 |
+
'法语 (瑞士)-Fabrice-男': 'fr-CH-FabriceNeural',
|
182 |
+
'爱尔兰语 (爱尔兰)-Colm-男': 'ga-IE-ColmNeural',
|
183 |
+
'爱尔兰语 (爱尔兰)-Orla-女': 'ga-IE-OrlaNeural',
|
184 |
+
'加利西亚语 (西班牙)-Roi-男': 'gl-ES-RoiNeural',
|
185 |
+
'加利西亚语 (西班牙)-Sabela-女': 'gl-ES-SabelaNeural',
|
186 |
+
'古吉拉特语 (印度)-Dhwani-女': 'gu-IN-DhwaniNeural',
|
187 |
+
'古吉拉特语 (印度)-Niranjan-男': 'gu-IN-NiranjanNeural',
|
188 |
+
'印地语 (印度)-Madhur-男': 'hi-IN-MadhurNeural',
'印地语 (印度)-Swara-女': 'hi-IN-SwaraNeural',
'克罗地亚语 (克罗地亚)-Gabrijela-女': 'hr-HR-GabrijelaNeural',
'克罗地亚语 (克罗地亚)-Srecko-男': 'hr-HR-SreckoNeural',
'匈牙利语 (匈牙利)-Noemi-女': 'hu-HU-NoemiNeural',
'匈牙利语 (匈牙利)-Tamas-男': 'hu-HU-TamasNeural',
'冰岛语 (冰岛)-Gudrun-女': 'is-IS-GudrunNeural',
'冰岛语 (冰岛)-Gunnar-男': 'is-IS-GunnarNeural',
'爪哇语 (印度尼西亚)-Dimas-男': 'jv-ID-DimasNeural',
'爪哇语 (印度尼西亚)-Siti-女': 'jv-ID-SitiNeural',
'格鲁吉亚语 (格鲁吉亚)-Eka-女': 'ka-GE-EkaNeural',
'格鲁吉亚语 (格鲁吉亚)-Giorgi-男': 'ka-GE-GiorgiNeural',
'哈萨克语 (哈萨克斯坦)-Aigul-女': 'kk-KZ-AigulNeural',
'哈萨克语 (哈萨克斯坦)-Daulet-男': 'kk-KZ-DauletNeural',
'高棉语 (柬埔寨)-Piseth-男': 'km-KH-PisethNeural',
'高棉语 (柬埔寨)-Sreymom-女': 'km-KH-SreymomNeural',
'卡纳达语 (印度)-Gagan-男': 'kn-IN-GaganNeural',
'卡纳达语 (印度)-Sapna-女': 'kn-IN-SapnaNeural',
'老挝语 (老挝)-Chanthavong-男': 'lo-LA-ChanthavongNeural',
'老挝语 (老挝)-Keomany-女': 'lo-LA-KeomanyNeural',
'立陶宛语 (立陶宛)-Leonas-男': 'lt-LT-LeonasNeural',
'立陶宛语 (立陶宛)-Ona-女': 'lt-LT-OnaNeural',
'拉脱维亚语 (拉脱维亚)-Everita-女': 'lv-LV-EveritaNeural',
'拉脱维亚语 (拉脱维亚)-Nils-男': 'lv-LV-NilsNeural',
'马其顿语 (北马其顿共和国)-Aleksandar-男': 'mk-MK-AleksandarNeural',
'马其顿语 (北马其顿共和国)-Marija-女': 'mk-MK-MarijaNeural',
'马拉雅拉姆语 (印度)-Midhun-男': 'ml-IN-MidhunNeural',
'马拉雅拉姆语 (印度)-Sobhana-女': 'ml-IN-SobhanaNeural',
'蒙古语 (蒙古)-Bataa-男': 'mn-MN-BataaNeural',
'蒙古语 (蒙古)-Yesui-女': 'mn-MN-YesuiNeural',
'马拉地语 (印度)-Aarohi-女': 'mr-IN-AarohiNeural',
'马拉地语 (印度)-Manohar-男': 'mr-IN-ManoharNeural',
'马耳他语 (马耳他)-Grace-女': 'mt-MT-GraceNeural',
'马耳他语 (马耳他)-Joseph-男': 'mt-MT-JosephNeural',
'缅甸语 (缅甸)-Nilar-女': 'my-MM-NilarNeural',
'缅甸语 (缅甸)-Thiha-男': 'my-MM-ThihaNeural',
'尼泊尔语 (尼泊尔)-Hemkala-女': 'ne-NP-HemkalaNeural',
'尼泊尔语 (尼泊尔)-Sagar-男': 'ne-NP-SagarNeural',
'荷兰语 (比利时)-Arnaud-男': 'nl-BE-ArnaudNeural',
'荷兰语 (比利时)-Dena-女': 'nl-BE-DenaNeural',
'波兰语 (波兰)-Marek-男': 'pl-PL-MarekNeural',
'波兰语 (波兰)-Zofia-女': 'pl-PL-ZofiaNeural',
'普什图语 (阿富汗)-Gul Nawaz-男': 'ps-AF-GulNawazNeural',
'普什图语 (阿富汗)-Latifa-女': 'ps-AF-LatifaNeural',
'葡萄牙语 (葡萄牙)-Duarte-男': 'pt-PT-DuarteNeural',
'葡萄牙语 (葡萄牙)-Raquel-女': 'pt-PT-RaquelNeural',
'罗马尼亚语 (罗马尼亚)-Alina-女': 'ro-RO-AlinaNeural',
'罗马尼亚语 (罗马尼亚)-Emil-男': 'ro-RO-EmilNeural',
'俄语 (俄罗斯)-Svetlana-女': 'ru-RU-SvetlanaNeural',
'俄语 (俄罗斯)-Dmitry-男': 'ru-RU-DmitryNeural',
'僧伽罗语 (斯里兰卡)-Sameera-男': 'si-LK-SameeraNeural',
'僧伽罗语 (斯里兰卡)-Thilini-女': 'si-LK-ThiliniNeural',
'斯洛伐克语 (斯洛伐克)-Lukas-男': 'sk-SK-LukasNeural',
'斯洛伐克语 (斯洛伐克)-Viktoria-女': 'sk-SK-ViktoriaNeural',
'斯洛文尼亚语 (斯洛文尼亚)-Petra-女': 'sl-SI-PetraNeural',
'斯洛文尼亚语 (斯洛文尼亚)-Rok-男': 'sl-SI-RokNeural',
'索马里语 (索马里)-Muuse-男': 'so-SO-MuuseNeural',
'索马里语 (索马里)-Ubax-女': 'so-SO-UbaxNeural',
'阿尔巴尼亚语 (阿尔巴尼亚)-Anila-女': 'sq-AL-AnilaNeural',
'阿尔巴尼亚语 (阿尔巴尼亚)-Ilir-男': 'sq-AL-IlirNeural',
'塞尔维亚语 (塞尔维亚)-Nicholas-男': 'sr-RS-NicholasNeural',
'塞尔维亚语 (塞尔维亚)-Sophie-女': 'sr-RS-SophieNeural',
'巽他语 (印度尼西亚)-Jajang-男': 'su-ID-JajangNeural',
'巽他语 (印度尼西亚)-Tuti-女': 'su-ID-TutiNeural',
'斯瓦希里语 (肯尼亚)-Rafiki-男': 'sw-KE-RafikiNeural',
'斯瓦希里语 (肯尼亚)-Zuri-女': 'sw-KE-ZuriNeural',
'斯瓦希里语 (坦桑尼亚)-Daudi-男': 'sw-TZ-DaudiNeural',
'斯瓦希里语 (坦桑尼亚)-Rehema-女': 'sw-TZ-RehemaNeural',
'泰米尔语 (印度)-Pallavi-女': 'ta-IN-PallaviNeural',
'泰米尔语 (印度)-Valluvar-男': 'ta-IN-ValluvarNeural',
'泰米尔语 (斯里兰卡)-Kumar-男': 'ta-LK-KumarNeural',
'泰米尔语 (斯里兰卡)-Saranya-女': 'ta-LK-SaranyaNeural',
'泰米尔语 (马来西亚)-Kani-女': 'ta-MY-KaniNeural',
'泰米尔语 (马来西亚)-Surya-男': 'ta-MY-SuryaNeural',
'泰米尔语 (新加坡)-Anbu-男': 'ta-SG-AnbuNeural',
'泰卢固语 (印度)-Mohan-男': 'te-IN-MohanNeural',
'泰卢固语 (印度)-Shruti-女': 'te-IN-ShrutiNeural',
'土耳其语 (土耳其)-Ahmet-男': 'tr-TR-AhmetNeural',
'土耳其语 (土耳其)-Emel-女': 'tr-TR-EmelNeural',
'乌克兰语 (乌克兰)-Ostap-男': 'uk-UA-OstapNeural',
'乌克兰语 (乌克兰)-Polina-女': 'uk-UA-PolinaNeural',
'乌尔都语 (印度)-Gul-女': 'ur-IN-GulNeural',
'乌尔都语 (印度)-Salman-男': 'ur-IN-SalmanNeural',
'乌尔都语 (巴基斯坦)-Asad-男': 'ur-PK-AsadNeural',
'乌尔都语 (巴基斯坦)-Uzma-女': 'ur-PK-UzmaNeural',
'乌兹别克语 (乌兹别克斯坦)-Madina-女': 'uz-UZ-MadinaNeural',
'乌兹别克语 (乌兹别克斯坦)-Sardor-男': 'uz-UZ-SardorNeural',
'普通话 (中国大陆)-Xiaoxiao-女': 'zh-CN-XiaoxiaoNeural',
'普通话 (中国大陆)-Yunyang-男': 'zh-CN-YunyangNeural',
'普通话 (中国大陆)-Yunxi-男': 'zh-CN-YunxiNeural',
'普通话 (中国大陆)-Xiaoyi-女': 'zh-CN-XiaoyiNeural',
'普通话 (中国大陆)-Yunjian-男': 'zh-CN-YunjianNeural',
'普通话 (中国大陆)-Yunxia-男': 'zh-CN-YunxiaNeural',
'东北话 (中国大陆)-Xiaobei-女': 'zh-CN-liaoning-XiaobeiNeural',
'中原官话 (中国陕西)-Xiaoni-女': 'zh-CN-shaanxi-XiaoniNeural',
'粤语 (中国香港)-HiuMaan-女': 'zh-HK-HiuMaanNeural',
'粤语 (中国香港)-HiuGaai-女': 'zh-HK-HiuGaaiNeural',
'粤语 (中国香港)-WanLung-男': 'zh-HK-WanLungNeural',
'台湾普通话-HsiaoChen-女': 'zh-TW-HsiaoChenNeural',
'台湾普通话-HsiaoYu-女': 'zh-TW-HsiaoYuNeural',
'台湾普通话-YunJhe-男': 'zh-TW-YunJheNeural',
'祖鲁语 (南非)-Thando-女': 'zu-ZA-ThandoNeural',
'祖鲁语 (南非)-Themba-男': 'zu-ZA-ThembaNeural'}
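The labels above map human-readable voice descriptions onto Edge TTS short voice names. A minimal lookup-and-synthesize sketch — assuming the dictionary is bound to a name like voice_map (the actual variable name is defined earlier in tts_voice.py) and that the third-party edge-tts package is installed:

import asyncio
import edge_tts  # assumed dependency; provides the *Neural voices listed above

async def speak(text, label):
    voice = voice_map[label]  # hypothetical binding for the dictionary above
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save("output.mp3")  # write the synthesized speech to disk

asyncio.run(speak("你好,世界", '普通话 (中国大陆)-Xiaoxiao-女'))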
utils.py
ADDED
@@ -0,0 +1,305 @@
import os
import sys
import glob
import argparse
import logging
import json
import subprocess
import numpy as np
import torch
import torchvision
from scipy.io.wavfile import read
from torch.nn import functional as F
from commons import sequence_mask
# Assumed companion packages (not part of this six-file upload): `wavlm`
# provides the WavLM content encoder and `hifigan` the vocoder used below.
from wavlm import WavLM, WavLMConfig
import hifigan

MATPLOTLIB_FLAG = False

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging

def get_cmodel(rank):
    # Load the pretrained WavLM-Large content encoder onto the given GPU.
    checkpoint = torch.load('wavlm/WavLM-Large.pt')
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg).cuda(rank)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()
    return cmodel


def get_content(cmodel, y):
    # Extract WavLM features from waveform y and return them as (B, C, T).
    with torch.no_grad():
        c = cmodel.extract_features(y.squeeze(1))[0]
    c = c.transpose(1, 2)
    return c


def get_vocoder(rank):
    # Build a HiFi-GAN generator from its JSON config and pretrained weights.
    with open("hifigan/config.json", "r") as f:
        config = json.load(f)
    config = hifigan.AttrDict(config)
    vocoder = hifigan.Generator(config)
    ckpt = torch.load("hifigan/generator_v1")
    vocoder.load_state_dict(ckpt["generator"])
    vocoder.eval()
    vocoder.remove_weight_norm()
    vocoder.cuda(rank)
    return vocoder

def transform(mel, height):  # 68-92
    # Resize the mel along the frequency axis to `height` bins, then either
    # crop back down or pad with a noisy copy of the top bin ("silence").
    #r = np.random.random()
    #rate = r * 0.3 + 0.85 # 0.85-1.15
    #height = int(mel.size(-2) * rate)
    tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
    if height >= mel.size(-2):
        return tgt[:, :mel.size(-2), :]
    else:
        silence = tgt[:, -1:, :].repeat(1, mel.size(-2) - height, 1)
        silence += torch.randn_like(silence) / 10
        return torch.cat((tgt, silence), 1)


def stretch(mel, width):  # 0.5-2
    # Resample the mel along the time axis to `width` frames.
    return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))

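A quick shape check for the two augmentations above — a sketch with a made-up 1×80×100 mel tensor:

import torch

mel = torch.randn(1, 80, 100)           # (channel, n_mels, frames), dummy data
squeezed = transform(mel, 70)           # squeezed to 70 bins, padded back to 80
stretched = stretch(mel, 150)           # time axis stretched to 150 frames
print(squeezed.shape, stretched.shape)  # torch.Size([1, 80, 100]) torch.Size([1, 80, 150])
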
def load_checkpoint(checkpoint_path, model, optimizer=None):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    # Copy weights tensor by tensor; fall back to the model's current value
    # for any parameter missing from the checkpoint.
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logger.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    logger.info("Saving model and optimizer state at iteration {} to {}".format(
        iteration, checkpoint_path))
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({'model': state_dict,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, checkpoint_path)

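A round-trip sketch of the two checkpoint helpers above — the tiny model and path are made up for illustration:

import torch

net = torch.nn.Linear(4, 4)  # stand-in for a real generator/discriminator
opt = torch.optim.AdamW(net.parameters(), lr=2e-4)
save_checkpoint(net, opt, learning_rate=2e-4, iteration=100, checkpoint_path="G_100.pth")
net, opt, lr, it = load_checkpoint("G_100.pth", net, opt)
assert (lr, it) == (2e-4, 100)
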
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
        writer.add_scalar(k, v, global_step)
    for k, v in histograms.items():
        writer.add_histogram(k, v, global_step)
    for k, v in images.items():
        writer.add_image(k, v, global_step, dataformats='HWC')
    for k, v in audios.items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    x = f_list[-1]
    print(x)
    return x

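Note that latest_checkpoint_path sorts by the integer formed from every digit in the path, so 'G_90.pth' correctly sorts below 'G_100.pth'. A quick check with made-up paths:

paths = ["logs/m1/G_90.pth", "logs/m1/G_100.pth", "logs/m1/G_20.pth"]
paths.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
print(paths[-1])  # logs/m1/G_100.pth
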
def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring is removed in recent NumPy; np.frombuffer is the equivalent.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def plot_alignment_to_numpy(alignment, info=None):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data

def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text

def get_hparams(init=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
                        help='JSON file for configuration')
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='Model name')

    args = parser.parse_args()
    model_dir = os.path.join("./logs", args.model)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    if init:
        with open(config_path, "r") as f:
            data = f.read()
        with open(config_save_path, "w") as f:
            f.write(data)
    else:
        with open(config_save_path, "r") as f:
            data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_dir(model_dir):
    config_save_path = os.path.join(model_dir, "config.json")
    with open(config_save_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_file(config_path):
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams

def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
            source_dir))
        return

    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
                saved_hash[:8], cur_hash[:8]))
    else:
        open(path, "w").write(cur_hash)

def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger

class HParams():
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
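HParams wraps a nested JSON config so values can be read both as attributes and as dict items. A small sketch with made-up config keys:

hps = HParams(**{"train": {"batch_size": 32}, "model": {"hidden_channels": 192}})
print(hps.train.batch_size)          # 32 — nested dicts become nested HParams
print(hps["model"].hidden_channels)  # 192 — __getitem__ falls through to getattr
print("train" in hps, len(hps))      # True 2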