kevinwang676 committed
Commit
1f74c86
1 Parent(s): dd6f15d

Upload 6 files

Files changed (6)
  1. commons.py +171 -0
  2. mel_processing.py +112 -0
  3. models.py +351 -0
  4. modules.py +342 -0
  5. tts_voice.py +290 -0
  6. utils.py +305 -0
commons.py ADDED
@@ -0,0 +1,171 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    # Flatten a reversed list of [left, right] pairs into the layout F.pad expects.
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q) between two diagonal Gaussians given means and log-stds."""
    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    # Gather a fixed-size window from each example, starting at ids_str[i].
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def rand_spec_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (num_timescales - 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    device = duration.device

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1. / norm_type)
    return total_norm
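
A minimal usage sketch for the masking and slicing helpers above (the shapes and values here are illustrative, not from the commit):

import torch
import commons

lengths = torch.tensor([3, 5])              # valid frames per example
mask = commons.sequence_mask(lengths)       # [2, 5] boolean mask
# [[True, True, True, False, False],
#  [True, True, True, True,  True ]]

x = torch.randn(2, 8, 10)                   # [batch, channels, time]
seg, ids = commons.rand_slice_segments(x, torch.tensor([10, 10]), segment_size=4)
# seg: [2, 8, 4] random windows; ids: the start frame chosen for each example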
mel_processing.py ADDED
@@ -0,0 +1,112 @@
import torch
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


# Caches keyed by "<param>_<dtype>_<device>" so mel bases and windows are built once per setting.
mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    # return_complex=False is deprecated in recent PyTorch; newer code would use
    # return_complex=True followed by torch.view_as_real.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
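
A sketch of computing a log-mel spectrogram with the helper above; the STFT settings are placeholders and should come from the model's config:

import torch
from mel_processing import mel_spectrogram_torch

wav = torch.randn(1, 16000).clamp(-1, 1)    # [batch, samples], float waveform in [-1, 1]
mel = mel_spectrogram_torch(wav, n_fft=1280, num_mels=80, sampling_rate=16000,
                            hop_size=320, win_size=1280, fmin=0, fmax=None)
print(mel.shape)                            # [1, 80, n_frames]

With fmax=None, librosa defaults the upper band edge to sampling_rate / 2.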
models.py ADDED
@@ -0,0 +1,351 @@
import torch
from torch import nn
from torch.nn import functional as F

import commons
import modules

from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding


class ResidualCouplingBlock(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class Encoder(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask


class Generator(torch.nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # reshape 1d waveform to 2d: [b, c, t] -> [b, c, t // period, period]
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class SpeakerEncoder(torch.nn.Module):
    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
        super(SpeakerEncoder, self).__init__()
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        embeds_raw = self.relu(self.linear(hidden[-1]))
        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        mel_slices = []
        for i in range(0, total_frames - partial_frames, partial_hop):
            mel_range = torch.arange(i, i + partial_frames)
            mel_slices.append(mel_range)

        return mel_slices

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        mel_len = mel.size(1)
        last_mel = mel[:, -partial_frames:]

        if mel_len > partial_frames:
            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
            mels = list(mel[:, s] for s in mel_slices)
            mels.append(last_mel)
            mels = torch.stack(tuple(mels), 0).squeeze(1)

            with torch.no_grad():
                partial_embeds = self(mels)
            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
            # embed = embed / torch.linalg.norm(embed, 2)
        else:
            with torch.no_grad():
                embed = self(last_mel)

        return embed


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self,
                 spec_channels,
                 segment_size,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 gin_channels,
                 ssl_dim,
                 use_spk,
                 **kwargs):

        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.ssl_dim = ssl_dim
        self.use_spk = use_spk

        self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

        if not self.use_spk:
            self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)

    def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        if spec_lengths is None:
            spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)

        if not self.use_spk:
            g = self.enc_spk(mel.transpose(1, 2))
        g = g.unsqueeze(-1)

        _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
        z_p = self.flow(z, spec_mask, g=g)

        z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
        o = self.dec(z_slice, g=g)

        return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, c, g=None, mel=None, c_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        if not self.use_spk:
            g = self.enc_spk.embed_utterance(mel.transpose(1, 2))
        g = g.unsqueeze(-1)

        z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
        z = self.flow(z_p, c_mask, g=g, reverse=True)
        o = self.dec(z * c_mask, g=g)

        return o
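
As a sketch, voice-conversion inference wires content features through enc_p, runs the flow in reverse, and decodes with the HiFi-GAN-style generator. The hyperparameters below are illustrative FreeVC-style values, not values stored in this commit:

import torch
from models import SynthesizerTrn

net_g = SynthesizerTrn(
    spec_channels=641, segment_size=8960, inter_channels=192,
    hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
    kernel_size=3, p_dropout=0.1, resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[10, 8, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    gin_channels=256, ssl_dim=1024, use_spk=False).eval()

c = torch.randn(1, 1024, 100)    # content features, e.g. from WavLM: [batch, ssl_dim, frames]
mel = torch.randn(1, 80, 100)    # reference mel for the speaker encoder: [batch, 80, frames]
with torch.no_grad():
    audio = net_g.infer(c, mel=mel)   # [1, 1, frames * prod(upsample_rates)]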
modules.py ADDED
@@ -0,0 +1,342 @@
import torch
from torch import nn
from torch.nn import functional as F

from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm

import commons
from commons import init_weights, get_padding


LRELU_SLOPE = 0.1


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


class ConvReluNorm(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask


class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                                            groups=channels, dilation=dilation, padding=padding))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y
        return x * x_mask


class WN(torch.nn.Module):
    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # the last layer has no residual branch, only a skip output
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)


class ResBlock1(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Log(nn.Module):
    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ElementwiseAffine(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class ResidualCouplingLayer(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x
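
A quick invertibility check for the coupling layer above: running the flow forward and then in reverse should recover the input (a sketch with toy shapes):

import torch
from modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=3,
                              dilation_rate=1, n_layers=2, mean_only=True).eval()
x = torch.randn(2, 4, 10)        # [batch, channels, time]
x_mask = torch.ones(2, 1, 10)
with torch.no_grad():
    y, logdet = layer(x, x_mask)             # forward direction
    x_rec = layer(y, x_mask, reverse=True)   # inverse direction
print(torch.allclose(x, x_rec, atol=1e-5))   # True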
tts_voice.py ADDED
@@ -0,0 +1,290 @@
tts_order_voice = {
    '英语 (美国)-Jenny-女': 'en-US-JennyNeural',
    '英语 (美国)-Guy-男': 'en-US-GuyNeural',
    '英语 (美国)-Ana-女': 'en-US-AnaNeural',
    '英语 (美国)-Aria-女': 'en-US-AriaNeural',
    '英语 (美国)-Christopher-男': 'en-US-ChristopherNeural',
    '英语 (美国)-Eric-男': 'en-US-EricNeural',
    '英语 (美国)-Michelle-女': 'en-US-MichelleNeural',
    '英语 (美国)-Roger-男': 'en-US-RogerNeural',
    '西班牙语 (墨西哥)-Dalia-女': 'es-MX-DaliaNeural',
    '西班牙语 (墨西哥)-Jorge-男': 'es-MX-JorgeNeural',
    '韩语 (韩国)-Sun-Hi-女': 'ko-KR-SunHiNeural',
    '韩语 (韩国)-InJoon-男': 'ko-KR-InJoonNeural',
    '泰语 (泰国)-Premwadee-女': 'th-TH-PremwadeeNeural',
    '泰语 (泰国)-Niwat-男': 'th-TH-NiwatNeural',
    '越南语 (越南)-HoaiMy-女': 'vi-VN-HoaiMyNeural',
    '越南语 (越南)-NamMinh-男': 'vi-VN-NamMinhNeural',
    '日语 (日本)-Nanami-女': 'ja-JP-NanamiNeural',
    '日语 (日本)-Keita-男': 'ja-JP-KeitaNeural',
    '法语 (法国)-Denise-女': 'fr-FR-DeniseNeural',
    '法语 (法国)-Eloise-女': 'fr-FR-EloiseNeural',
    '法语 (法国)-Henri-男': 'fr-FR-HenriNeural',
    '葡萄牙语 (巴西)-Francisca-女': 'pt-BR-FranciscaNeural',
    '葡萄牙语 (巴西)-Antonio-男': 'pt-BR-AntonioNeural',
    '印度尼西亚语 (印度尼西亚)-Ardi-男': 'id-ID-ArdiNeural',
    '印度尼西亚语 (印度尼西亚)-Gadis-女': 'id-ID-GadisNeural',
    '希伯来语 (以色列)-Avri-男': 'he-IL-AvriNeural',
    '希伯来语 (以色列)-Hila-女': 'he-IL-HilaNeural',
    '意大利语 (意大利)-Isabella-女': 'it-IT-IsabellaNeural',
    '意大利语 (意大利)-Diego-男': 'it-IT-DiegoNeural',
    '意大利语 (意大利)-Elsa-女': 'it-IT-ElsaNeural',
    '荷兰语 (荷兰)-Colette-女': 'nl-NL-ColetteNeural',
    '荷兰语 (荷兰)-Fenna-女': 'nl-NL-FennaNeural',
    '荷兰语 (荷兰)-Maarten-男': 'nl-NL-MaartenNeural',
    '马来语 (马来西亚)-Osman-男': 'ms-MY-OsmanNeural',
    '马来语 (马来西亚)-Yasmin-女': 'ms-MY-YasminNeural',
    '挪威语 (挪威)-Pernille-女': 'nb-NO-PernilleNeural',
    '挪威语 (挪威)-Finn-男': 'nb-NO-FinnNeural',
    '瑞典语 (瑞典)-Sofie-女': 'sv-SE-SofieNeural',
    '瑞典语 (瑞典)-Mattias-男': 'sv-SE-MattiasNeural',
    '阿拉伯语 (沙特阿拉伯)-Hamed-男': 'ar-SA-HamedNeural',
    '阿拉伯语 (沙特阿拉伯)-Zariyah-女': 'ar-SA-ZariyahNeural',
    '希腊语 (希腊)-Athina-女': 'el-GR-AthinaNeural',
    '希腊语 (希腊)-Nestoras-男': 'el-GR-NestorasNeural',
    '德语 (德国)-Katja-女': 'de-DE-KatjaNeural',
    '德语 (德国)-Amala-女': 'de-DE-AmalaNeural',
    '德语 (德国)-Conrad-男': 'de-DE-ConradNeural',
    '德语 (德国)-Killian-男': 'de-DE-KillianNeural',
    '阿拉伯语 (南非)-Adri-女': 'af-ZA-AdriNeural',
    '阿拉伯语 (南非)-Willem-男': 'af-ZA-WillemNeural',
    '阿姆哈拉语 (埃塞俄比亚)-Ameha-男': 'am-ET-AmehaNeural',
    '阿姆哈拉语 (埃塞俄比亚)-Mekdes-女': 'am-ET-MekdesNeural',
    '阿拉伯语 (阿拉伯联合酋长国)-Fatima-女': 'ar-AE-FatimaNeural',
    '阿拉伯语 (阿拉伯联合酋长国)-Hamdan-男': 'ar-AE-HamdanNeural',
    '阿拉伯语 (巴林)-Ali-男': 'ar-BH-AliNeural',
    '阿拉伯语 (巴林)-Laila-女': 'ar-BH-LailaNeural',
    '阿拉伯语 (阿尔及利亚)-Ismael-男': 'ar-DZ-IsmaelNeural',
    '阿拉伯语 (埃及)-Salma-女': 'ar-EG-SalmaNeural',
    '阿拉伯语 (埃及)-Shakir-男': 'ar-EG-ShakirNeural',
    '阿拉伯语 (伊拉克)-Bassel-男': 'ar-IQ-BasselNeural',
    '阿拉伯语 (伊拉克)-Rana-女': 'ar-IQ-RanaNeural',
    '阿拉伯语 (约旦)-Sana-女': 'ar-JO-SanaNeural',
    '阿拉伯语 (约旦)-Taim-男': 'ar-JO-TaimNeural',
    '阿拉伯语 (科威特)-Fahed-男': 'ar-KW-FahedNeural',
    '阿拉伯语 (科威特)-Noura-女': 'ar-KW-NouraNeural',
    '阿拉伯语 (黎巴嫩)-Layla-女': 'ar-LB-LaylaNeural',
    '阿拉伯语 (黎巴嫩)-Rami-男': 'ar-LB-RamiNeural',
    '阿拉伯语 (利比亚)-Iman-女': 'ar-LY-ImanNeural',
    '阿拉伯语 (利比亚)-Omar-男': 'ar-LY-OmarNeural',
    '阿拉伯语 (摩洛哥)-Jamal-男': 'ar-MA-JamalNeural',
    '阿拉伯语 (摩洛哥)-Mouna-女': 'ar-MA-MounaNeural',
    '阿拉伯语 (阿曼)-Abdullah-男': 'ar-OM-AbdullahNeural',
    '阿拉伯语 (阿曼)-Aysha-女': 'ar-OM-AyshaNeural',
    '阿拉伯语 (卡塔尔)-Amal-女': 'ar-QA-AmalNeural',
    '阿拉伯语 (卡塔尔)-Moaz-男': 'ar-QA-MoazNeural',
    '阿拉伯语 (叙利亚)-Amany-女': 'ar-SY-AmanyNeural',
    '阿拉伯语 (叙利亚)-Laith-男': 'ar-SY-LaithNeural',
    '阿拉伯语 (突尼斯)-Hedi-男': 'ar-TN-HediNeural',
    '阿拉伯语 (突尼斯)-Reem-女': 'ar-TN-ReemNeural',
    '阿拉伯语 (也门)-Maryam-女': 'ar-YE-MaryamNeural',
    '阿拉伯语 (也门)-Saleh-男': 'ar-YE-SalehNeural',
    '阿塞拜疆语 (阿塞拜疆)-Babek-男': 'az-AZ-BabekNeural',
    '阿塞拜疆语 (阿塞拜疆)-Banu-女': 'az-AZ-BanuNeural',
    '保加利亚语 (保加利亚)-Borislav-男': 'bg-BG-BorislavNeural',
    '保加利亚语 (保加利亚)-Kalina-女': 'bg-BG-KalinaNeural',
    '孟加拉语 (孟加拉国)-Nabanita-女': 'bn-BD-NabanitaNeural',
    '孟加拉语 (孟加拉国)-Pradeep-男': 'bn-BD-PradeepNeural',
    '孟加拉语 (印度)-Bashkar-男': 'bn-IN-BashkarNeural',
    '孟加拉语 (印度)-Tanishaa-女': 'bn-IN-TanishaaNeural',
    '波斯尼亚语 (波斯尼亚和黑塞哥维那)-Goran-男': 'bs-BA-GoranNeural',
    '波斯尼亚语 (波斯尼亚和黑塞哥维那)-Vesna-女': 'bs-BA-VesnaNeural',
    '加泰罗尼亚语 (西班牙)-Joana-女': 'ca-ES-JoanaNeural',
    '加泰罗尼亚语 (西班牙)-Enric-男': 'ca-ES-EnricNeural',
    '捷克语 (捷克共和国)-Antonin-男': 'cs-CZ-AntoninNeural',
    '捷克语 (捷克共和国)-Vlasta-女': 'cs-CZ-VlastaNeural',
    '威尔士语 (英国)-Aled-男': 'cy-GB-AledNeural',
    '威尔士语 (英国)-Nia-女': 'cy-GB-NiaNeural',
    '丹麦语 (丹麦)-Christel-女': 'da-DK-ChristelNeural',
    '丹麦语 (丹麦)-Jeppe-男': 'da-DK-JeppeNeural',
    '德语 (奥地利)-Ingrid-女': 'de-AT-IngridNeural',
    '德语 (奥地利)-Jonas-男': 'de-AT-JonasNeural',
    '德语 (瑞士)-Jan-男': 'de-CH-JanNeural',
    '德语 (瑞士)-Leni-女': 'de-CH-LeniNeural',
    '英语 (澳大利亚)-Natasha-女': 'en-AU-NatashaNeural',
    '英语 (澳大利亚)-William-男': 'en-AU-WilliamNeural',
    '英语 (加拿大)-Clara-女': 'en-CA-ClaraNeural',
    '英语 (加拿大)-Liam-男': 'en-CA-LiamNeural',
    '英语 (英国)-Libby-女': 'en-GB-LibbyNeural',
    '英语 (英国)-Maisie-女': 'en-GB-MaisieNeural',
    '英语 (英国)-Ryan-男': 'en-GB-RyanNeural',
    '英语 (英国)-Sonia-女': 'en-GB-SoniaNeural',
    '英语 (英国)-Thomas-男': 'en-GB-ThomasNeural',
    '英语 (香港)-Sam-男': 'en-HK-SamNeural',
    '英语 (香港)-Yan-女': 'en-HK-YanNeural',
    '英语 (爱尔兰)-Connor-男': 'en-IE-ConnorNeural',
    '英语 (爱尔兰)-Emily-女': 'en-IE-EmilyNeural',
    '英语 (印度)-Neerja-女': 'en-IN-NeerjaNeural',
    '英语 (印度)-Prabhat-男': 'en-IN-PrabhatNeural',
    '英语 (肯尼亚)-Asilia-女': 'en-KE-AsiliaNeural',
    '英语 (肯尼亚)-Chilemba-男': 'en-KE-ChilembaNeural',
    '英语 (尼日利亚)-Abeo-男': 'en-NG-AbeoNeural',
    '英语 (尼日利亚)-Ezinne-女': 'en-NG-EzinneNeural',
    '英语 (新西兰)-Mitchell-男': 'en-NZ-MitchellNeural',
    '英语 (菲律宾)-James-男': 'en-PH-JamesNeural',
    '英语 (菲律宾)-Rosa-女': 'en-PH-RosaNeural',
    '英语 (新加坡)-Luna-女': 'en-SG-LunaNeural',
    '英语 (新加坡)-Wayne-男': 'en-SG-WayneNeural',
    '英语 (坦桑尼亚)-Elimu-男': 'en-TZ-ElimuNeural',
    '英语 (坦桑尼亚)-Imani-女': 'en-TZ-ImaniNeural',
    '英语 (南非)-Leah-女': 'en-ZA-LeahNeural',
    '英语 (南非)-Luke-男': 'en-ZA-LukeNeural',
    '西班牙语 (阿根廷)-Elena-女': 'es-AR-ElenaNeural',
    '西班牙语 (阿根廷)-Tomas-男': 'es-AR-TomasNeural',
    '西班牙语 (玻利维亚)-Marcelo-男': 'es-BO-MarceloNeural',
    '西班牙语 (玻利维亚)-Sofia-女': 'es-BO-SofiaNeural',
    '西班牙语 (哥伦比亚)-Gonzalo-男': 'es-CO-GonzaloNeural',
    '西班牙语 (哥伦比亚)-Salome-女': 'es-CO-SalomeNeural',
    '西班牙语 (哥斯达黎加)-Juan-男': 'es-CR-JuanNeural',
    '西班牙语 (哥斯达黎加)-Maria-女': 'es-CR-MariaNeural',
    '西班牙语 (古巴)-Belkys-女': 'es-CU-BelkysNeural',
    '西班牙语 (多米尼加共和国)-Emilio-男': 'es-DO-EmilioNeural',
    '西班牙语 (多米尼加共和国)-Ramona-女': 'es-DO-RamonaNeural',
    '西班牙语 (厄瓜多尔)-Andrea-女': 'es-EC-AndreaNeural',
    '西班牙语 (厄瓜多尔)-Luis-男': 'es-EC-LuisNeural',
    '西班牙语 (西班牙)-Alvaro-男': 'es-ES-AlvaroNeural',
    '西班牙语 (西班牙)-Elvira-女': 'es-ES-ElviraNeural',
    '西班牙语 (赤道几内亚)-Teresa-女': 'es-GQ-TeresaNeural',
    '西班牙语 (危地马拉)-Andres-男': 'es-GT-AndresNeural',
    '西班牙语 (危地马拉)-Marta-女': 'es-GT-MartaNeural',
    '西班牙语 (洪都拉斯)-Carlos-男': 'es-HN-CarlosNeural',
    '西班牙语 (洪都拉斯)-Karla-女': 'es-HN-KarlaNeural',
    '西班牙语 (尼加拉瓜)-Federico-男': 'es-NI-FedericoNeural',
    '西班牙语 (尼加拉瓜)-Yolanda-女': 'es-NI-YolandaNeural',
    '西班牙语 (巴拿马)-Margarita-女': 'es-PA-MargaritaNeural',
    '西班牙语 (巴拿马)-Roberto-男': 'es-PA-RobertoNeural',
    '西班牙语 (秘鲁)-Alex-男': 'es-PE-AlexNeural',
    '西班牙语 (秘鲁)-Camila-女': 'es-PE-CamilaNeural',
    '西班牙语 (波多黎各)-Karina-女': 'es-PR-KarinaNeural',
    '西班牙语 (波多黎各)-Victor-男': 'es-PR-VictorNeural',
    '西班牙语 (巴拉圭)-Mario-男': 'es-PY-MarioNeural',
    '西班牙语 (巴拉圭)-Tania-女': 'es-PY-TaniaNeural',
    '西班牙语 (萨尔瓦多)-Lorena-女': 'es-SV-LorenaNeural',
    '西班牙语 (萨尔瓦多)-Rodrigo-男': 'es-SV-RodrigoNeural',
    '西班牙语 (美国)-Alonso-男': 'es-US-AlonsoNeural',
    '西班牙语 (美国)-Paloma-女': 'es-US-PalomaNeural',
    '西班牙语 (乌拉圭)-Mateo-男': 'es-UY-MateoNeural',
    '西班牙语 (乌拉圭)-Valentina-女': 'es-UY-ValentinaNeural',
    '西班牙语 (委内瑞拉)-Paola-女': 'es-VE-PaolaNeural',
    '西班牙语 (委内瑞拉)-Sebastian-男': 'es-VE-SebastianNeural',
    '爱沙尼亚语 (爱沙尼亚)-Anu-女': 'et-EE-AnuNeural',
    '爱沙尼亚语 (爱沙尼亚)-Kert-男': 'et-EE-KertNeural',
    '波斯语 (伊朗)-Dilara-女': 'fa-IR-DilaraNeural',
    '波斯语 (伊朗)-Farid-男': 'fa-IR-FaridNeural',
    '芬兰语 (芬兰)-Harri-男': 'fi-FI-HarriNeural',
    '芬兰语 (芬兰)-Noora-女': 'fi-FI-NooraNeural',
    '法语 (比利时)-Charline-女': 'fr-BE-CharlineNeural',
    '法语 (比利时)-Gerard-男': 'fr-BE-GerardNeural',
    '法语 (加拿大)-Sylvie-女': 'fr-CA-SylvieNeural',
    '法语 (加拿大)-Antoine-男': 'fr-CA-AntoineNeural',
    '法语 (加拿大)-Jean-男': 'fr-CA-JeanNeural',
    '法语 (瑞士)-Ariane-女': 'fr-CH-ArianeNeural',
    '法语 (瑞士)-Fabrice-男': 'fr-CH-FabriceNeural',
    '爱尔兰语 (爱尔兰)-Colm-男': 'ga-IE-ColmNeural',
    '爱尔兰语 (爱尔兰)-Orla-女': 'ga-IE-OrlaNeural',
    '加利西亚语 (西班牙)-Roi-男': 'gl-ES-RoiNeural',
    '加利西亚语 (西班牙)-Sabela-女': 'gl-ES-SabelaNeural',
    '古吉拉特语 (印度)-Dhwani-女': 'gu-IN-DhwaniNeural',
    '古吉拉特语 (印度)-Niranjan-男': 'gu-IN-NiranjanNeural',
    '印地语 (印度)-Madhur-男': 'hi-IN-MadhurNeural',
    '印地语 (印度)-Swara-女': 'hi-IN-SwaraNeural',
    '克罗地亚语 (克罗地亚)-Gabrijela-女': 'hr-HR-GabrijelaNeural',
    '克罗地亚语 (克罗地亚)-Srecko-男': 'hr-HR-SreckoNeural',
    '匈牙利语 (匈牙利)-Noemi-女': 'hu-HU-NoemiNeural',
    '匈牙利语 (匈牙利)-Tamas-男': 'hu-HU-TamasNeural',
    '冰岛语 (冰岛)-Gudrun-女': 'is-IS-GudrunNeural',
    '冰岛语 (冰岛)-Gunnar-男': 'is-IS-GunnarNeural',
    '爪哇语 (印度尼西亚)-Dimas-男': 'jv-ID-DimasNeural',
    '爪哇语 (印度尼西亚)-Siti-女': 'jv-ID-SitiNeural',
    '格鲁吉亚语 (格鲁吉亚)-Eka-女': 'ka-GE-EkaNeural',
    '格鲁吉亚语 (格鲁吉亚)-Giorgi-男': 'ka-GE-GiorgiNeural',
    '哈萨克语 (哈萨克斯坦)-Aigul-女': 'kk-KZ-AigulNeural',
    '哈萨克语 (哈萨克斯坦)-Daulet-男': 'kk-KZ-DauletNeural',
    '高棉语 (柬埔寨)-Piseth-男': 'km-KH-PisethNeural',
    '高棉语 (柬埔寨)-Sreymom-女': 'km-KH-SreymomNeural',
    '卡纳达语 (印度)-Gagan-男': 'kn-IN-GaganNeural',
    '卡纳达语 (印度)-Sapna-女': 'kn-IN-SapnaNeural',
    '老挝语 (老挝)-Chanthavong-男': 'lo-LA-ChanthavongNeural',
    '老挝语 (老挝)-Keomany-女': 'lo-LA-KeomanyNeural',
    '立陶宛语 (立陶宛)-Leonas-男': 'lt-LT-LeonasNeural',
    '立陶宛语 (立陶宛)-Ona-女': 'lt-LT-OnaNeural',
    '拉脱维亚语 (拉脱维亚)-Everita-女': 'lv-LV-EveritaNeural',
    '拉脱维亚语 (拉脱维亚)-Nils-男': 'lv-LV-NilsNeural',
    '马其顿语 (北马其顿共和国)-Aleksandar-男': 'mk-MK-AleksandarNeural',
    '马其顿语 (北马其顿共和国)-Marija-女': 'mk-MK-MarijaNeural',
    '马拉雅拉姆语 (印度)-Midhun-男': 'ml-IN-MidhunNeural',
    '马拉雅拉姆语 (印度)-Sobhana-女': 'ml-IN-SobhanaNeural',
    '蒙古语 (蒙古)-Bataa-男': 'mn-MN-BataaNeural',
    '蒙古语 (蒙古)-Yesui-女': 'mn-MN-YesuiNeural',
    '马拉地语 (印度)-Aarohi-女': 'mr-IN-AarohiNeural',
    '马拉地语 (印度)-Manohar-男': 'mr-IN-ManoharNeural',
    '马耳他语 (马耳他)-Grace-女': 'mt-MT-GraceNeural',
    '马耳他语 (马耳他)-Joseph-男': 'mt-MT-JosephNeural',
    '缅甸语 (缅甸)-Nilar-女': 'my-MM-NilarNeural',
    '缅甸语 (缅甸)-Thiha-男': 'my-MM-ThihaNeural',
    '尼泊尔语 (尼泊尔)-Hemkala-女': 'ne-NP-HemkalaNeural',
    '尼泊尔语 (尼泊尔)-Sagar-男': 'ne-NP-SagarNeural',
    '荷兰语 (比利时)-Arnaud-男': 'nl-BE-ArnaudNeural',
    '荷兰语 (比利时)-Dena-女': 'nl-BE-DenaNeural',
    '波兰语 (波兰)-Marek-男': 'pl-PL-MarekNeural',
    '波兰语 (波兰)-Zofia-女': 'pl-PL-ZofiaNeural',
    '普什图语 (阿富汗)-Gul Nawaz-男': 'ps-AF-GulNawazNeural',
    '普什图语 (阿富汗)-Latifa-女': 'ps-AF-LatifaNeural',
    '葡萄牙语 (葡萄牙)-Duarte-男': 'pt-PT-DuarteNeural',
    '葡萄牙语 (葡萄牙)-Raquel-女': 'pt-PT-RaquelNeural',
    '罗马尼亚语 (罗马尼亚)-Alina-女': 'ro-RO-AlinaNeural',
    '罗马尼亚语 (罗马尼亚)-Emil-男': 'ro-RO-EmilNeural',
    '俄语 (俄罗斯)-Svetlana-女': 'ru-RU-SvetlanaNeural',
    '俄语 (俄罗斯)-Dmitry-男': 'ru-RU-DmitryNeural',
    '僧伽罗语 (斯里兰卡)-Sameera-男': 'si-LK-SameeraNeural',
    '僧伽罗语 (斯里兰卡)-Thilini-女': 'si-LK-ThiliniNeural',
    '斯洛伐克语 (斯洛伐克)-Lukas-男': 'sk-SK-LukasNeural',
    '斯洛伐克语 (斯洛伐克)-Viktoria-女': 'sk-SK-ViktoriaNeural',
    '斯洛文尼亚语 (斯洛文尼亚)-Petra-女': 'sl-SI-PetraNeural',
    '斯洛文尼亚语 (斯洛文尼亚)-Rok-男': 'sl-SI-RokNeural',
    '索马里语 (索马里)-Muuse-男': 'so-SO-MuuseNeural',
    '索马里语 (索马里)-Ubax-女': 'so-SO-UbaxNeural',
    '阿尔巴尼亚语 (阿尔巴尼亚)-Anila-女': 'sq-AL-AnilaNeural',
    '阿尔巴尼亚语 (阿尔巴尼亚)-Ilir-男': 'sq-AL-IlirNeural',
    '塞尔维亚语 (塞尔维亚)-Nicholas-男': 'sr-RS-NicholasNeural',
    '塞尔维亚语 (塞尔维亚)-Sophie-女': 'sr-RS-SophieNeural',
    '巽他语 (印度尼西亚)-Jajang-男': 'su-ID-JajangNeural',
    '巽他语 (印度尼西亚)-Tuti-女': 'su-ID-TutiNeural',
    '斯瓦希里语 (肯尼亚)-Rafiki-男': 'sw-KE-RafikiNeural',
    '斯瓦希里语 (肯尼亚)-Zuri-女': 'sw-KE-ZuriNeural',
    '斯瓦希里语 (坦桑尼亚)-Daudi-男': 'sw-TZ-DaudiNeural',
    '斯瓦希里语 (坦桑尼亚)-Rehema-女': 'sw-TZ-RehemaNeural',
    '泰米尔语 (印度)-Pallavi-女': 'ta-IN-PallaviNeural',
    '泰米尔语 (印度)-Valluvar-男': 'ta-IN-ValluvarNeural',
    '泰米尔语 (斯里兰卡)-Kumar-男': 'ta-LK-KumarNeural',
    '泰米尔语 (斯里兰卡)-Saranya-女': 'ta-LK-SaranyaNeural',
    '泰米尔语 (马来西亚)-Kani-女': 'ta-MY-KaniNeural',
    '泰米尔语 (马来西亚)-Surya-男': 'ta-MY-SuryaNeural',
    '泰米尔语 (新加坡)-Anbu-男': 'ta-SG-AnbuNeural',
    '泰卢固语 (印度)-Mohan-男': 'te-IN-MohanNeural',
    '泰卢固语 (印度)-Shruti-女': 'te-IN-ShrutiNeural',
    '土耳其语 (土耳其)-Ahmet-男': 'tr-TR-AhmetNeural',
    '土耳其语 (土耳其)-Emel-女': 'tr-TR-EmelNeural',
    '乌克兰语 (乌克兰)-Ostap-男': 'uk-UA-OstapNeural',
    '乌克兰语 (乌克兰)-Polina-女': 'uk-UA-PolinaNeural',
    '乌尔都语 (印度)-Gul-女': 'ur-IN-GulNeural',
    '乌尔都语 (印度)-Salman-男': 'ur-IN-SalmanNeural',
    '乌尔都语 (巴基斯坦)-Asad-男': 'ur-PK-AsadNeural',
    '乌尔都语 (巴基斯坦)-Uzma-女': 'ur-PK-UzmaNeural',
    '乌兹别克语 (乌兹别克斯坦)-Madina-女': 'uz-UZ-MadinaNeural',
    '乌兹别克语 (乌兹别克斯坦)-Sardor-男': 'uz-UZ-SardorNeural',
    '普通话 (中国大陆)-Xiaoxiao-女': 'zh-CN-XiaoxiaoNeural',
    '普通话 (中国大陆)-Yunyang-男': 'zh-CN-YunyangNeural',
    '普通话 (中国大陆)-Yunxi-男': 'zh-CN-YunxiNeural',
    '普通话 (中国大陆)-Xiaoyi-女': 'zh-CN-XiaoyiNeural',
    '普通话 (中国大陆)-Yunjian-男': 'zh-CN-YunjianNeural',
    '普通话 (中国大陆)-Yunxia-男': 'zh-CN-YunxiaNeural',
    '东北话 (中国大陆)-Xiaobei-女': 'zh-CN-liaoning-XiaobeiNeural',
    '中原官话 (中国陕西)-Xiaoni-女': 'zh-CN-shaanxi-XiaoniNeural',
    '粤语 (中国香港)-HiuMaan-女': 'zh-HK-HiuMaanNeural',
    '粤语 (中国香港)-HiuGaai-女': 'zh-HK-HiuGaaiNeural',
    '粤语 (中国香港)-WanLung-男': 'zh-HK-WanLungNeural',
    '台湾普通话-HsiaoChen-女': 'zh-TW-HsiaoChenNeural',
    '台湾普通话-HsiaoYu-女': 'zh-TW-HsiaoYuNeural',
    '台湾普通话-YunJhe-男': 'zh-TW-YunJheNeural',
    '祖鲁语 (南非)-Thando-女': 'zu-ZA-ThandoNeural',
    '祖鲁语 (南非)-Themba-男': 'zu-ZA-ThembaNeural',
}
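
The dictionary maps a human-readable label (language and region, voice name, gender) to a Microsoft Edge neural-TTS voice ID. A lookup sketch, assuming the third-party edge-tts package, which is not part of this commit:

import asyncio
import edge_tts  # assumed external dependency
from tts_voice import tts_order_voice

voice = tts_order_voice['英语 (美国)-Jenny-女']   # -> 'en-US-JennyNeural'

async def speak(text: str, voice: str, path: str = 'out.mp3'):
    # Communicate(...).save(...) is the edge-tts convenience API
    await edge_tts.Communicate(text, voice).save(path)

asyncio.run(speak('Hello there!', voice))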
utils.py ADDED
@@ -0,0 +1,305 @@
import os
import sys
import argparse
import glob
import logging
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch
from torch.nn import functional as F
from commons import sequence_mask

MATPLOTLIB_FLAG = False

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging


def get_cmodel(rank):
    # WavLM and WavLMConfig are assumed to come from the repo's wavlm package,
    # which is not part of this commit (the checkpoint path below implies it).
    from wavlm import WavLM, WavLMConfig
    checkpoint = torch.load('wavlm/WavLM-Large.pt')
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg).cuda(rank)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()
    return cmodel


def get_content(cmodel, y):
    with torch.no_grad():
        c = cmodel.extract_features(y.squeeze(1))[0]
    c = c.transpose(1, 2)
    return c


def get_vocoder(rank):
    # hifigan is assumed to be a local package providing AttrDict and Generator;
    # it is not part of this commit.
    import hifigan
    with open("hifigan/config.json", "r") as f:
        config = json.load(f)
    config = hifigan.AttrDict(config)
    vocoder = hifigan.Generator(config)
    ckpt = torch.load("hifigan/generator_v1")
    vocoder.load_state_dict(ckpt["generator"])
    vocoder.eval()
    vocoder.remove_weight_norm()
    vocoder.cuda(rank)
    return vocoder


def transform(mel, height):  # 68-92
    import torchvision
    # r = np.random.random()
    # rate = r * 0.3 + 0.85  # 0.85-1.15
    # height = int(mel.size(-2) * rate)
    tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
    if height >= mel.size(-2):
        return tgt[:, :mel.size(-2), :]
    else:
        silence = tgt[:, -1:, :].repeat(1, mel.size(-2) - height, 1)
        silence += torch.randn_like(silence) / 10
        return torch.cat((tgt, silence), 1)


def stretch(mel, width):  # 0.5-2
    import torchvision
    return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))


def load_checkpoint(checkpoint_path, model, optimizer=None):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logger.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    logger.info("Saving model and optimizer state at iteration {} to {}".format(
        iteration, checkpoint_path))
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({'model': state_dict,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, checkpoint_path)


def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
        writer.add_scalar(k, v, global_step)
    for k, v in histograms.items():
        writer.add_histogram(k, v, global_step)
    for k, v in images.items():
        writer.add_image(k, v, global_step, dataformats='HWC')
    for k, v in audios.items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    x = f_list[-1]
    print(x)
    return x


def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring is deprecated; frombuffer reads the canvas bytes directly
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def plot_alignment_to_numpy(alignment, info=None):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


def get_hparams(init=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
                        help='JSON file for configuration')
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='Model name')

    args = parser.parse_args()
    model_dir = os.path.join("./logs", args.model)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    if init:
        with open(config_path, "r") as f:
            data = f.read()
        with open(config_save_path, "w") as f:
            f.write(data)
    else:
        with open(config_save_path, "r") as f:
            data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_dir(model_dir):
    config_save_path = os.path.join(model_dir, "config.json")
    with open(config_save_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_file(config_path):
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams


def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
            source_dir))
        return

    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
                saved_hash[:8], cur_hash[:8]))
    else:
        open(path, "w").write(cur_hash)


def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger


class HParams:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)  # recurse so nested sections support attribute access
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
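
HParams wraps nested config dicts so values can be read as attributes or keys. A sketch, assuming a config file with a train section (the path and keys below are illustrative):

from utils import get_hparams_from_file

hps = get_hparams_from_file('configs/freevc.json')   # hypothetical path
print(hps.train.learning_rate)                       # nested dicts become nested HParams
print(hps['train']['batch_size'])                    # item access works too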