Kit-Lemonfoot commited on
Commit
6613f80
1 Parent(s): c8b0ba8

Fixed deprecation issue, added new EN and Gram Pico.

Browse files
.gitattributes CHANGED
@@ -385,3 +385,9 @@ weights/holostars/Rikka/added_IVF2819_Flat_nprobe_1_Rikkaroid_Hybrid_KitLemonfoo
385
  weights/other/Sakana/added_IVF853_Flat_nprobe_1_h2osakana_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
386
  weights/phaseconnect/Runie/added_IVF1386_Flat_nprobe_1_runie_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
387
  weights/phaseconnect/Lumi/added_IVF1260_Flat_nprobe_1_kanekolumi_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
385
  weights/other/Sakana/added_IVF853_Flat_nprobe_1_h2osakana_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
386
  weights/phaseconnect/Runie/added_IVF1386_Flat_nprobe_1_runie_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
387
  weights/phaseconnect/Lumi/added_IVF1260_Flat_nprobe_1_kanekolumi_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
388
+ weights/hololive-en/Cecilia/added_IVF1477_Flat_nprobe_1_CeciliaImmergreen_Singing_KitLemonfoot_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
389
+ weights/hololive-en/Elizabeth/added_IVF1418_Flat_nprobe_1_Elizabeth_Rose_Bloodflame_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
390
+ weights/hololive-en/Elizabeth/added_IVF1418_Flat_nprobe_1_Elizabeth_Rose_Bloodflame_v2.index filter=lfs diff=lfs merge=lfs -text
391
+ weights/hololive-en/Gigi/added_IVF1648_Flat_nprobe_1_gigi-murin_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
392
+ weights/hololive-en/Raora/added_IVF2050_Flat_nprobe_1_raora_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
393
+ weights/phaseconnect/Pico/added_IVF1132_Flat_nprobe_1_grampico_v2_mbkm.index filter=lfs diff=lfs merge=lfs -text
lib/infer_pack/models.py CHANGED
@@ -1,1124 +1,1125 @@
1
- import math, pdb, os
2
- from time import time as ttime
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
- from lib.infer_pack import modules
7
- from lib.infer_pack import attentions
8
- from lib.infer_pack import commons
9
- from lib.infer_pack.commons import init_weights, get_padding
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from lib.infer_pack.commons import init_weights
13
- import numpy as np
14
- from lib.infer_pack import commons
15
-
16
-
17
class TextEncoder256(nn.Module):
    """Transformer prior encoder for 256-dim phone features (v1 models).

    Projects phone embeddings (plus an optional coarse-pitch embedding)
    into hidden space, runs a self-attention encoder, and emits the mean
    and log-std of the prior distribution.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Fixed: `if f0 == True` -> idiomatic truthiness test.
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # 256 coarse pitch bins
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phones (+ optional pitch) into prior statistics.

        Returns (m, logs, x_mask): mean and log-std shaped
        [b, out_channels, t], and the time mask shaped [b, 1, t].
        """
        # Fixed: `pitch == None` -> `pitch is None` (PEP 8; robust to
        # tensor __eq__ broadcasting).
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
62
-
63
-
64
class TextEncoder768(nn.Module):
    """Transformer prior encoder for 768-dim phone features (v2 models).

    Identical to TextEncoder256 except the phone projection takes
    768-dim inputs (e.g. larger HuBERT features).
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Fixed: `if f0 == True` -> idiomatic truthiness test.
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # 256 coarse pitch bins
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phones (+ optional pitch) into prior statistics.

        Returns (m, logs, x_mask): mean and log-std shaped
        [b, out_channels, t], and the time mask shaped [b, 1, t].
        """
        # Fixed: `pitch == None` -> `pitch is None` (PEP 8; robust to
        # tensor __eq__ broadcasting).
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
109
-
110
-
111
class ResidualCouplingBlock(nn.Module):
    """Normalizing-flow block: affine coupling layers alternated with flips."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            # Flip channel halves so both halves get transformed over the stack.
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flow stack; `reverse=True` inverts it (sampling path)."""
        if reverse:
            for step in reversed(self.flows):
                x = step(x, x_mask, g=g, reverse=reverse)
        else:
            for step in self.flows:
                x, _ = step(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        # Coupling layers sit at even indices; Flip layers carry no weights.
        for idx in range(self.n_flows):
            self.flows[idx * 2].remove_weight_norm()
158
-
159
-
160
class PosteriorEncoder(nn.Module):
    """WaveNet-style posterior encoder: spectrogram -> sampled latent z."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return (z, m, logs, x_mask); z is sampled as m + eps * exp(logs)."""
        mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        h = self.pre(x) * mask
        h = self.enc(h, mask, g=g)
        stats = self.proj(h) * mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * mask
        return z, m, logs, mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
203
-
204
-
205
class Generator(torch.nn.Module):
    """HiFi-GAN style decoder: transposed-conv upsampling plus
    multi-receptive-field residual blocks, optional speaker conditioning."""

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for idx, (rate, width) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2**idx)
            out_ch = upsample_initial_channel // (2 ** (idx + 1))
            layer = ConvTranspose1d(
                in_ch, out_ch, width, rate, padding=(width - rate) // 2
            )
            self.ups.append(weight_norm(layer))

        self.resblocks = nn.ModuleList()
        for idx in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (idx + 1))
            for width, dil in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(ch, width, dil))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode latent x (and optional speaker embedding g) to a waveform."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the parallel resblock branches at this scale.
            acc = None
            for j in range(self.num_kernels):
                branch = self.resblocks[i * self.num_kernels + j](x)
                acc = branch if acc is None else acc + branch
            x = acc / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
279
-
280
-
281
class SineGen(torch.nn.Module):
    """Sine-wave source generator for the NSF vocoder.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of additive Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for voiced/unvoiced decisions (default 0)
    flag_for_pulse: whether this SineGen is used inside PulseGen (default False)

    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0).
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # Voiced/unvoiced mask: 1.0 where f0 exceeds the threshold, else 0.0.
        return torch.ones_like(f0) * (f0 > self.voiced_threshold)

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)

        input F0: tensor(batchsize=1, length, dim=1), zero on unvoiced steps
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            harmonics = torch.zeros(
                f0.shape[0], f0.shape[1], self.dim, device=f0.device
            )
            harmonics[:, :, 0] = f0[:, :, 0]  # fundamental component
            for k in np.arange(self.harmonic_num):
                # k + 2: the (k+1)-th overtone is the (k+2)-th harmonic.
                harmonics[:, :, k + 1] = harmonics[:, :, 0] * (k + 2)
            # Per-frame phase increments wrapped to [0, 1); the %1 means the
            # harmonic products cannot be optimized away after this point.
            rad_values = (harmonics / self.sampling_rate) % 1
            rand_ini = torch.rand(
                harmonics.shape[0], harmonics.shape[2], device=harmonics.device
            )
            rand_ini[:, 0] = 0  # keep the fundamental's initial phase at zero
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # Accumulated phase; kept un-wrapped so interpolation stays linear.
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            tmp_over_one %= 1
            # Detect phase wrap-arounds and compensate the cumulative sum so
            # the upsampled phase stays continuous.
            wrapped = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = wrapped * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Voiced frames get noise_std noise; unvoiced get sine_amp/3 noise.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
371
-
372
-
373
class SourceModuleHnNSF(torch.nn.Module):
    """Harmonic-plus-noise source module for hn-NSF.

    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003);
        note that the amplitude of noise in unvoiced frames is decided
        by sine_amp
    voiced_threshold: threshold to set voiced/unvoiced given F0 (default: 0)

    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # Sine generator producing the harmonic stack.
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        # Collapse all harmonics into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Return (sine_merge, None, None); noise/uv outputs are unused here."""
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()  # match fp16 inference pipelines
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None
420
-
421
-
422
class GeneratorNSF(torch.nn.Module):
    """HiFi-GAN decoder with an NSF harmonic source injected at each scale."""

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for idx, (rate, width) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            out_ch = upsample_initial_channel // (2 ** (idx + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**idx),
                        out_ch,
                        width,
                        rate,
                        padding=(width - rate) // 2,
                    )
                )
            )
            if idx + 1 < len(upsample_rates):
                # Strided conv downsamples the full-rate source to this scale.
                stride_f0 = np.prod(upsample_rates[idx + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        out_ch,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                # Final scale already matches the source rate.
                self.noise_convs.append(Conv1d(1, out_ch, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for idx in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (idx + 1))
            for width, dil in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(ch, width, dil))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor, used to stretch f0 to sample rate.
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode latent x with f0-driven harmonic excitation (g: speaker)."""
        har_source, _, _ = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x) + self.noise_convs[i](har_source)
            # Average the parallel resblock branches at this scale.
            acc = None
            for j in range(self.num_kernels):
                branch = self.resblocks[i * self.num_kernels + j](x)
                acc = branch if acc is None else acc + branch
            x = acc / self.num_kernels
        x = F.leaky_relu(x)
        return torch.tanh(self.conv_post(x))

    def remove_weight_norm(self):
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
523
-
524
-
525
# Map sample-rate labels used in model configs to their value in Hz.
sr2sr = {"32k": 32000, "40k": 40000, "48k": 48000}
530
-
531
-
532
class SynthesizerTrnMs256NSFsid(nn.Module):
    """VITS-style synthesizer (v1, 256-dim phone features) with NSF decoder.

    Combines TextEncoder256 (prior), PosteriorEncoder, a residual coupling
    flow, GeneratorNSF (f0-conditioned decoder), and a speaker embedding.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # Fixed: `type(sr) == type("strr")` -> isinstance (PEP 8 idiom).
        # Accept "32k"/"40k"/"48k" labels as well as integer sample rates.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds: speaker ids, [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1], broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice f0 to the same random training segment as z.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample the prior at reduced temperature (0.66666) for stability.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
641
-
642
-
643
class SynthesizerTrnMs768NSFsid(nn.Module):
    """VITS-style synthesizer (v2, 768-dim phone features) with NSF decoder.

    Identical to SynthesizerTrnMs256NSFsid except the prior encoder is
    TextEncoder768.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        # Fixed: `type(sr) == type("strr")` -> isinstance (PEP 8 idiom).
        # Accept "32k"/"40k"/"48k" labels as well as integer sample rates.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds: speaker ids, [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1], broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice f0 to the same random training segment as z.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample the prior at reduced temperature (0.66666) for stability.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
752
-
753
-
754
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """VITS-style synthesizer (v1, 256-dim phone features) without F0/NSF.

    Uses a pitch-free prior encoder (f0=False) and the plain HiFi-GAN
    Generator instead of the NSF decoder.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            kernel_size=5,
            dilation_rate=1,
            n_layers=16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels,
            hidden_channels,
            kernel_size=5,
            dilation_rate=1,
            n_layers=3,
            gin_channels=gin_channels,
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        # ds: speaker ids, shape [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1], broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        # Reduced-temperature sampling from the prior.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
854
-
855
-
856
class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """VITS-style synthesizer (v2, 768-dim phone features) without F0/NSF.

    Uses a pitch-free prior encoder (f0=False) and the plain HiFi-GAN
    Generator instead of the NSF decoder.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            kernel_size=5,
            dilation_rate=1,
            n_layers=16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels,
            hidden_channels,
            kernel_size=5,
            dilation_rate=1,
            n_layers=3,
            gin_channels=gin_channels,
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        # ds: speaker ids, shape [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1], broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        # Reduced-temperature sampling from the prior.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
956
-
957
-
958
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of one scale discriminator and several period discriminators.

    Both the real waveform ``y`` and the generated waveform ``y_hat`` are run
    through every sub-discriminator; scores and intermediate feature maps are
    collected for the adversarial and feature-matching losses.
    """

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        # Prime periods so the period discriminators' 2-D foldings do not alias.
        periods = (2, 3, 5, 7, 11, 17)
        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """Return (real scores, fake scores, real feature maps, fake feature maps)."""
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
986
-
987
-
988
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """V2 discriminator ensemble: like MultiPeriodDiscriminator but with two
    extra (larger) prime periods, 23 and 37, for finer pitch coverage."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        # Extended prime-period set used by the v2 models.
        periods = (2, 3, 5, 7, 11, 17, 23, 37)
        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """Return (real scores, fake scores, real feature maps, fake feature maps)."""
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1016
-
1017
-
1018
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: strided, grouped 1-D convolutions over the raw waveform."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        # spectral_norm when requested, weight_norm otherwise.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return (flattened logits, list of intermediate feature maps) for x (B, 1, T)."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap
1046
-
1047
-
1048
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the waveform into (T // period, period) and
    applies 2-D convolutions along the time axis only (kernel width 1)."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        pad = (get_padding(kernel_size, 1), 0)
        # Standard HiFi-GAN channel progression; the strided stages downsample time.
        channels = (1, 32, 128, 512, 1024)
        convs = [
            norm_f(
                Conv2d(
                    c_in,
                    c_out,
                    (kernel_size, 1),
                    (stride, 1),
                    padding=pad,
                )
            )
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ]
        # Final stage keeps temporal resolution (stride 1).
        convs.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(convs)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened logits, feature maps) for a waveform x of shape (B, C, T)."""
        fmap = []

        # 1-D -> 2-D: reflect-pad so the length divides evenly by the period.
        b, c, t = x.shape
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
 
 
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from lib.infer_pack import modules
7
+ from lib.infer_pack import attentions
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import remove_weight_norm
12
+ from torch.nn.utils.parametrizations import spectral_norm, weight_norm
13
+ from lib.infer_pack.commons import init_weights
14
+ import numpy as np
15
+ from lib.infer_pack import commons
16
+
17
+
18
class TextEncoder256(nn.Module):
    """Prior text/content encoder for 256-dim content features.

    Projects per-frame content embeddings (plus an optional coarse-pitch
    embedding when ``f0`` is enabled) through a transformer encoder and emits
    the prior mean and log-variance for the flow.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # 256 coarse pitch bins
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return (m, logs, x_mask); ``pitch`` may be None for no-f0 models."""
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
63
+
64
+
65
class TextEncoder768(nn.Module):
    """Prior text/content encoder for 768-dim content features.

    Identical to TextEncoder256 except the input projection expects 768-dim
    content vectors (e.g. larger HuBERT/ContentVec features).
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # 256 coarse pitch bins
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return (m, logs, x_mask); ``pitch`` may be None for no-f0 models."""
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
110
+
111
+
112
class ResidualCouplingBlock(nn.Module):
    """Normalizing flow: affine coupling layers interleaved with channel flips."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            # Flip channel halves so both halves get transformed across layers.
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flow (or its inverse when ``reverse`` is True) to x under x_mask."""
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        # Only the coupling layers (even indices) carry weight norm; Flips do not.
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
159
+
160
+
161
class PosteriorEncoder(nn.Module):
    """WaveNet-style posterior encoder: maps a spectrogram to a sampled latent
    z with its Gaussian parameters (mean, log-std) and a length mask."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Encode x (B, in_channels, T); returns (z, m, logs, mask)."""
        mask = torch.unsqueeze(
            commons.sequence_mask(x_lengths, x.size(2)), 1
        ).to(x.dtype)
        hidden = self.pre(x) * mask
        hidden = self.enc(hidden, mask, g=g)
        stats = self.proj(hidden) * mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterization trick: sample z from N(m, exp(logs)^2) inside the mask.
        z = (m + torch.randn_like(m) * torch.exp(logs)) * mask
        return z, m, logs, mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
204
+
205
+
206
class Generator(torch.nn.Module):
    """HiFi-GAN generator: transposed-convolution upsampling stack where each
    stage's output is the average of several parallel residual blocks."""

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # ``resblock`` is a config flag ("1"/"2") selecting the residual block variant;
        # keep the argument intact instead of rebinding it to the class.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            # Channels halve at every upsampling stage.
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            # Optional speaker-conditioning projection.
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode latent x (B, C, T) [+ optional conditioning g] to a waveform in [-1, 1]."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels  # average the parallel resblock outputs
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
280
+
281
+
282
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1  # fundamental + overtones
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Binary voiced/unvoiced signal: 1 where f0 exceeds the threshold."""
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv, noise = forward(f0, upp)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        upp: upsampling factor from F0 frame rate to sample rate
        output sine_tensor: tensor(batchsize=1, length*upp, dim)
        output uv: tensor(batchsize=1, length*upp, 1)
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # Fundamental component.
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in range(self.harmonic_num):
                # idx + 2: the (idx+1)-th overtone, i.e. the (idx+2)-th harmonic.
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
            # NOTE: the % 1 means the per-harmonic products cannot be optimized later.
            rad_values = (f0_buf / self.sampling_rate) % 1
            # Random initial phase per harmonic (except the fundamental).
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # Cumulative phase; deliberately NOT reduced mod 1 yet (see below).
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )
            tmp_over_one %= 1
            # Detect phase wrap-arounds to correct the upsampled cumulative sum.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Voiced frames get small Gaussian noise; unvoiced get noise only.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
372
+
373
+
374
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003);
        note that the amplitude of noise in unvoiced regions is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # Sine-waveform generator for the fundamental plus overtones.
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        # Collapse all harmonic channels into one excitation signal.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Return (merged sine excitation, None, None) for the F0 contour ``x``."""
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()
        merged = self.l_tanh(self.l_linear(sine_wavs))
        # Noise and uv outputs are unused downstream; keep the 3-tuple shape.
        return merged, None, None
421
+
422
+
423
class GeneratorNSF(torch.nn.Module):
    """NSF variant of the HiFi-GAN generator.

    Adds a sine-excitation branch: harmonics generated from the F0 contour by
    ``SourceModuleHnNSF`` are downsampled with ``noise_convs`` and injected
    after every upsampling stage.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # ``resblock`` is a config flag ("1"/"2") selecting the residual block variant;
        # keep the argument intact instead of rebinding it to the class.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        c_cur,
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            if i + 1 < len(upsample_rates):
                # Downsample the excitation to match this stage's resolution.
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                # Last stage is already at sample rate; 1x1 conv only.
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock_cls(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor: hop length between F0 frames and samples.
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode latent x with F0 contour f0 (frame rate) to a waveform in [-1, 1]."""
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Inject the harmonic excitation at this stage's resolution.
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels  # average the parallel resblock outputs
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()
524
+
525
+
526
# Mapping from the sample-rate tag used in model configs to the rate in Hz.
sr2sr = {f"{rate}k": rate * 1000 for rate in (32, 40, 48)}
531
+
532
+
533
class SynthesizerTrnMs256NSFsid(nn.Module):
    """VITS-style synthesizer: 256-dim content features, NSF decoder, speaker id.

    Training ``forward`` encodes the spectrogram with the posterior encoder and
    decodes a random latent slice; ``infer`` samples from the text/content prior
    and runs the flow in reverse.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        if isinstance(sr, str):
            # Configs may carry the sample rate as a tag like "40k".
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds: speaker ids, shape [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1]; broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice the F0 contour to the same random segment as z.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        """Sample from the prior (temperature 0.66666) and decode up to max_len frames."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
642
+
643
+
644
class SynthesizerTrnMs768NSFsid(nn.Module):
    """VITS-style synthesizer: 768-dim content features, NSF decoder, speaker id.

    Identical to SynthesizerTrnMs256NSFsid except the prior encoder expects
    768-dim content features.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        if isinstance(sr, str):
            # Configs may carry the sample rate as a tag like "40k".
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds: speaker ids, shape [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1]; broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Slice the F0 contour to the same random segment as z.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        """Sample from the prior (temperature 0.66666) and decode up to max_len frames."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
753
+
754
+
755
class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    """VITS-style synthesizer (256-dim content features) without F0 conditioning.

    Uses the plain HiFi-GAN ``Generator`` and a prior encoder built with
    ``f0=False``; the ``sr`` argument is accepted for signature compatibility
    but unused.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        # Prior encoder without the pitch-embedding branch.
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        # Plain HiFi-GAN decoder (no NSF excitation branch).
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Training pass; ``ds`` holds speaker ids of shape [bs, 1]."""
        spk = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1]; broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=spk)
        z_p = self.flow(z, y_mask, g=spk)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=spk)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        """Sample from the prior (temperature 0.66666) and decode up to max_len frames."""
        spk = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=spk, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=spk)
        return o, x_mask, (z, z_p, m_p, logs_p)
855
+
856
+
857
class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    """VITS-style synthesizer over 768-dim phone features, without F0.

    Pipeline: ``TextEncoder768`` prior -> ``PosteriorEncoder`` (training
    only) -> ``ResidualCouplingBlock`` normalizing flow -> plain HiFi-GAN
    ``Generator`` decoder, with a learned speaker embedding injected into
    every conditioned submodule through ``gin_channels``.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        # Keep the full hyper-parameter set on the instance for
        # checkpointing / introspection.
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        # Prior encoder over 768-dim features; f0=False -> no pitch embedding.
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        # Non-NSF waveform decoder (this variant does not condition on F0).
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        # Posterior encoder over linear spectrograms (training path only).
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight normalization from the decoder, flow, and posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):
        """Training forward pass; ``ds`` holds integer speaker ids ([bs, 1])."""
        spk = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1]; broadcasts over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=spk)
        z_p = self.flow(z, y_mask, g=spk)
        # Decode only a random slice to keep the GAN step affordable.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        audio = self.dec(z_slice, g=spk)
        return audio, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        """Inference path: sample the prior, invert the flow, decode audio."""
        spk = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        # 0.66666 is the sampling temperature used by the VITS recipe.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=spk, reverse=True)
        audio = self.dec((z * x_mask)[:, :, :max_len], g=spk)
        return audio, x_mask, (z, z_p, m_p, logs_p)
957
+
958
+
959
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of one scale discriminator plus period discriminators (v1 periods)."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        self.discriminators = nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
            + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
        )

    def forward(self, y, y_hat):
        """Score real (``y``) and generated (``y_hat``) waveforms.

        Returns four parallel lists, one entry per sub-discriminator:
        real logits, fake logits, real feature maps, fake feature maps.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_fake, feats_fake = disc(y_hat)
            y_d_rs.append(score_real)
            y_d_gs.append(score_fake)
            fmap_rs.append(feats_real)
            fmap_gs.append(feats_fake)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
987
+
988
+
989
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Ensemble of one scale discriminator plus period discriminators (v2 periods)."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        # v2 extends the v1 period set with 23 and 37.
        periods = [2, 3, 5, 7, 11, 17, 23, 37]
        self.discriminators = nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
            + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
        )

    def forward(self, y, y_hat):
        """Score real (``y``) and generated (``y_hat``) waveforms.

        Returns four parallel lists, one entry per sub-discriminator:
        real logits, fake logits, real feature maps, fake feature maps.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_fake, feats_fake = disc(y_hat)
            y_d_rs.append(score_real)
            y_d_gs.append(score_fake)
            fmap_rs.append(feats_real)
            fmap_gs.append(feats_fake)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1017
+
1018
+
1019
class DiscriminatorS(torch.nn.Module):
    """HiFi-GAN scale discriminator operating on the raw 1-D waveform."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        # Idiomatic truth test instead of comparing to the literal False.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return (flattened logits, list of intermediate feature maps)."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
1047
+
1048
+
1049
class DiscriminatorP(torch.nn.Module):
    """HiFi-GAN period discriminator.

    Folds the waveform into a 2-D (frames x period) grid and applies a
    stack of strided 2-D convolutions along the frame axis.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Idiomatic truth test instead of comparing to the literal False.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # Strided stages; channel widths follow the HiFi-GAN reference stack.
        channel_pairs = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        convs = [
            norm_f(
                Conv2d(
                    c_in,
                    c_out,
                    (kernel_size, 1),
                    (stride, 1),
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
            for c_in, c_out in channel_pairs
        ]
        # Final stage keeps the temporal resolution (stride 1).
        convs.append(
            norm_f(
                Conv2d(
                    1024,
                    1024,
                    (kernel_size, 1),
                    1,
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
        )
        self.convs = nn.ModuleList(convs)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened logits, feature maps) for input of shape [b, c, t]."""
        fmap = []

        # Reshape 1-D signal to 2-D: reflect-pad so t is divisible by the period.
        b, c, t = x.shape
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
lib/infer_pack/models_onnx.py CHANGED
@@ -1,819 +1,820 @@
1
- import math, pdb, os
2
- from time import time as ttime
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
- from lib.infer_pack import modules
7
- from lib.infer_pack import attentions
8
- from lib.infer_pack import commons
9
- from lib.infer_pack.commons import init_weights, get_padding
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from lib.infer_pack.commons import init_weights
13
- import numpy as np
14
- from lib.infer_pack import commons
15
-
16
-
17
- class TextEncoder256(nn.Module):
18
- def __init__(
19
- self,
20
- out_channels,
21
- hidden_channels,
22
- filter_channels,
23
- n_heads,
24
- n_layers,
25
- kernel_size,
26
- p_dropout,
27
- f0=True,
28
- ):
29
- super().__init__()
30
- self.out_channels = out_channels
31
- self.hidden_channels = hidden_channels
32
- self.filter_channels = filter_channels
33
- self.n_heads = n_heads
34
- self.n_layers = n_layers
35
- self.kernel_size = kernel_size
36
- self.p_dropout = p_dropout
37
- self.emb_phone = nn.Linear(256, hidden_channels)
38
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
- if f0 == True:
40
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
- self.encoder = attentions.Encoder(
42
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
- )
44
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
-
46
- def forward(self, phone, pitch, lengths):
47
- if pitch == None:
48
- x = self.emb_phone(phone)
49
- else:
50
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
- x = self.lrelu(x)
53
- x = torch.transpose(x, 1, -1) # [b, h, t]
54
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
- x.dtype
56
- )
57
- x = self.encoder(x * x_mask, x_mask)
58
- stats = self.proj(x) * x_mask
59
-
60
- m, logs = torch.split(stats, self.out_channels, dim=1)
61
- return m, logs, x_mask
62
-
63
-
64
- class TextEncoder768(nn.Module):
65
- def __init__(
66
- self,
67
- out_channels,
68
- hidden_channels,
69
- filter_channels,
70
- n_heads,
71
- n_layers,
72
- kernel_size,
73
- p_dropout,
74
- f0=True,
75
- ):
76
- super().__init__()
77
- self.out_channels = out_channels
78
- self.hidden_channels = hidden_channels
79
- self.filter_channels = filter_channels
80
- self.n_heads = n_heads
81
- self.n_layers = n_layers
82
- self.kernel_size = kernel_size
83
- self.p_dropout = p_dropout
84
- self.emb_phone = nn.Linear(768, hidden_channels)
85
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
- if f0 == True:
87
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
- self.encoder = attentions.Encoder(
89
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
- )
91
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
-
93
- def forward(self, phone, pitch, lengths):
94
- if pitch == None:
95
- x = self.emb_phone(phone)
96
- else:
97
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
- x = self.lrelu(x)
100
- x = torch.transpose(x, 1, -1) # [b, h, t]
101
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
- x.dtype
103
- )
104
- x = self.encoder(x * x_mask, x_mask)
105
- stats = self.proj(x) * x_mask
106
-
107
- m, logs = torch.split(stats, self.out_channels, dim=1)
108
- return m, logs, x_mask
109
-
110
-
111
- class ResidualCouplingBlock(nn.Module):
112
- def __init__(
113
- self,
114
- channels,
115
- hidden_channels,
116
- kernel_size,
117
- dilation_rate,
118
- n_layers,
119
- n_flows=4,
120
- gin_channels=0,
121
- ):
122
- super().__init__()
123
- self.channels = channels
124
- self.hidden_channels = hidden_channels
125
- self.kernel_size = kernel_size
126
- self.dilation_rate = dilation_rate
127
- self.n_layers = n_layers
128
- self.n_flows = n_flows
129
- self.gin_channels = gin_channels
130
-
131
- self.flows = nn.ModuleList()
132
- for i in range(n_flows):
133
- self.flows.append(
134
- modules.ResidualCouplingLayer(
135
- channels,
136
- hidden_channels,
137
- kernel_size,
138
- dilation_rate,
139
- n_layers,
140
- gin_channels=gin_channels,
141
- mean_only=True,
142
- )
143
- )
144
- self.flows.append(modules.Flip())
145
-
146
- def forward(self, x, x_mask, g=None, reverse=False):
147
- if not reverse:
148
- for flow in self.flows:
149
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
- else:
151
- for flow in reversed(self.flows):
152
- x = flow(x, x_mask, g=g, reverse=reverse)
153
- return x
154
-
155
- def remove_weight_norm(self):
156
- for i in range(self.n_flows):
157
- self.flows[i * 2].remove_weight_norm()
158
-
159
-
160
- class PosteriorEncoder(nn.Module):
161
- def __init__(
162
- self,
163
- in_channels,
164
- out_channels,
165
- hidden_channels,
166
- kernel_size,
167
- dilation_rate,
168
- n_layers,
169
- gin_channels=0,
170
- ):
171
- super().__init__()
172
- self.in_channels = in_channels
173
- self.out_channels = out_channels
174
- self.hidden_channels = hidden_channels
175
- self.kernel_size = kernel_size
176
- self.dilation_rate = dilation_rate
177
- self.n_layers = n_layers
178
- self.gin_channels = gin_channels
179
-
180
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
- self.enc = modules.WN(
182
- hidden_channels,
183
- kernel_size,
184
- dilation_rate,
185
- n_layers,
186
- gin_channels=gin_channels,
187
- )
188
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
-
190
- def forward(self, x, x_lengths, g=None):
191
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
- x.dtype
193
- )
194
- x = self.pre(x) * x_mask
195
- x = self.enc(x, x_mask, g=g)
196
- stats = self.proj(x) * x_mask
197
- m, logs = torch.split(stats, self.out_channels, dim=1)
198
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
- return z, m, logs, x_mask
200
-
201
- def remove_weight_norm(self):
202
- self.enc.remove_weight_norm()
203
-
204
-
205
- class Generator(torch.nn.Module):
206
- def __init__(
207
- self,
208
- initial_channel,
209
- resblock,
210
- resblock_kernel_sizes,
211
- resblock_dilation_sizes,
212
- upsample_rates,
213
- upsample_initial_channel,
214
- upsample_kernel_sizes,
215
- gin_channels=0,
216
- ):
217
- super(Generator, self).__init__()
218
- self.num_kernels = len(resblock_kernel_sizes)
219
- self.num_upsamples = len(upsample_rates)
220
- self.conv_pre = Conv1d(
221
- initial_channel, upsample_initial_channel, 7, 1, padding=3
222
- )
223
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
-
225
- self.ups = nn.ModuleList()
226
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
- self.ups.append(
228
- weight_norm(
229
- ConvTranspose1d(
230
- upsample_initial_channel // (2**i),
231
- upsample_initial_channel // (2 ** (i + 1)),
232
- k,
233
- u,
234
- padding=(k - u) // 2,
235
- )
236
- )
237
- )
238
-
239
- self.resblocks = nn.ModuleList()
240
- for i in range(len(self.ups)):
241
- ch = upsample_initial_channel // (2 ** (i + 1))
242
- for j, (k, d) in enumerate(
243
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
- ):
245
- self.resblocks.append(resblock(ch, k, d))
246
-
247
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
- self.ups.apply(init_weights)
249
-
250
- if gin_channels != 0:
251
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
-
253
- def forward(self, x, g=None):
254
- x = self.conv_pre(x)
255
- if g is not None:
256
- x = x + self.cond(g)
257
-
258
- for i in range(self.num_upsamples):
259
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
- x = self.ups[i](x)
261
- xs = None
262
- for j in range(self.num_kernels):
263
- if xs is None:
264
- xs = self.resblocks[i * self.num_kernels + j](x)
265
- else:
266
- xs += self.resblocks[i * self.num_kernels + j](x)
267
- x = xs / self.num_kernels
268
- x = F.leaky_relu(x)
269
- x = self.conv_post(x)
270
- x = torch.tanh(x)
271
-
272
- return x
273
-
274
- def remove_weight_norm(self):
275
- for l in self.ups:
276
- remove_weight_norm(l)
277
- for l in self.resblocks:
278
- l.remove_weight_norm()
279
-
280
-
281
- class SineGen(torch.nn.Module):
282
- """Definition of sine generator
283
- SineGen(samp_rate, harmonic_num = 0,
284
- sine_amp = 0.1, noise_std = 0.003,
285
- voiced_threshold = 0,
286
- flag_for_pulse=False)
287
- samp_rate: sampling rate in Hz
288
- harmonic_num: number of harmonic overtones (default 0)
289
- sine_amp: amplitude of sine-wavefrom (default 0.1)
290
- noise_std: std of Gaussian noise (default 0.003)
291
- voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
- flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
- Note: when flag_for_pulse is True, the first time step of a voiced
294
- segment is always sin(np.pi) or cos(0)
295
- """
296
-
297
- def __init__(
298
- self,
299
- samp_rate,
300
- harmonic_num=0,
301
- sine_amp=0.1,
302
- noise_std=0.003,
303
- voiced_threshold=0,
304
- flag_for_pulse=False,
305
- ):
306
- super(SineGen, self).__init__()
307
- self.sine_amp = sine_amp
308
- self.noise_std = noise_std
309
- self.harmonic_num = harmonic_num
310
- self.dim = self.harmonic_num + 1
311
- self.sampling_rate = samp_rate
312
- self.voiced_threshold = voiced_threshold
313
-
314
- def _f02uv(self, f0):
315
- # generate uv signal
316
- uv = torch.ones_like(f0)
317
- uv = uv * (f0 > self.voiced_threshold)
318
- return uv
319
-
320
- def forward(self, f0, upp):
321
- """sine_tensor, uv = forward(f0)
322
- input F0: tensor(batchsize=1, length, dim=1)
323
- f0 for unvoiced steps should be 0
324
- output sine_tensor: tensor(batchsize=1, length, dim)
325
- output uv: tensor(batchsize=1, length, 1)
326
- """
327
- with torch.no_grad():
328
- f0 = f0[:, None].transpose(1, 2)
329
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
- # fundamental component
331
- f0_buf[:, :, 0] = f0[:, :, 0]
332
- for idx in np.arange(self.harmonic_num):
333
- f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
- idx + 2
335
- ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
- rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
337
- rand_ini = torch.rand(
338
- f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
- )
340
- rand_ini[:, 0] = 0
341
- rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
- tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
343
- tmp_over_one *= upp
344
- tmp_over_one = F.interpolate(
345
- tmp_over_one.transpose(2, 1),
346
- scale_factor=upp,
347
- mode="linear",
348
- align_corners=True,
349
- ).transpose(2, 1)
350
- rad_values = F.interpolate(
351
- rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
- ).transpose(
353
- 2, 1
354
- ) #######
355
- tmp_over_one %= 1
356
- tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
- cumsum_shift = torch.zeros_like(rad_values)
358
- cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
- sine_waves = torch.sin(
360
- torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
- )
362
- sine_waves = sine_waves * self.sine_amp
363
- uv = self._f02uv(f0)
364
- uv = F.interpolate(
365
- uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
- ).transpose(2, 1)
367
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
- noise = noise_amp * torch.randn_like(sine_waves)
369
- sine_waves = sine_waves * uv + noise
370
- return sine_waves, uv, noise
371
-
372
-
373
- class SourceModuleHnNSF(torch.nn.Module):
374
- """SourceModule for hn-nsf
375
- SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
- add_noise_std=0.003, voiced_threshod=0)
377
- sampling_rate: sampling_rate in Hz
378
- harmonic_num: number of harmonic above F0 (default: 0)
379
- sine_amp: amplitude of sine source signal (default: 0.1)
380
- add_noise_std: std of additive Gaussian noise (default: 0.003)
381
- note that amplitude of noise in unvoiced is decided
382
- by sine_amp
383
- voiced_threshold: threhold to set U/V given F0 (default: 0)
384
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
- F0_sampled (batchsize, length, 1)
386
- Sine_source (batchsize, length, 1)
387
- noise_source (batchsize, length 1)
388
- uv (batchsize, length, 1)
389
- """
390
-
391
- def __init__(
392
- self,
393
- sampling_rate,
394
- harmonic_num=0,
395
- sine_amp=0.1,
396
- add_noise_std=0.003,
397
- voiced_threshod=0,
398
- is_half=True,
399
- ):
400
- super(SourceModuleHnNSF, self).__init__()
401
-
402
- self.sine_amp = sine_amp
403
- self.noise_std = add_noise_std
404
- self.is_half = is_half
405
- # to produce sine waveforms
406
- self.l_sin_gen = SineGen(
407
- sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
- )
409
-
410
- # to merge source harmonics into a single excitation
411
- self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
- self.l_tanh = torch.nn.Tanh()
413
-
414
- def forward(self, x, upp=None):
415
- sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
- if self.is_half:
417
- sine_wavs = sine_wavs.half()
418
- sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
- return sine_merge, None, None # noise, uv
420
-
421
-
422
- class GeneratorNSF(torch.nn.Module):
423
- def __init__(
424
- self,
425
- initial_channel,
426
- resblock,
427
- resblock_kernel_sizes,
428
- resblock_dilation_sizes,
429
- upsample_rates,
430
- upsample_initial_channel,
431
- upsample_kernel_sizes,
432
- gin_channels,
433
- sr,
434
- is_half=False,
435
- ):
436
- super(GeneratorNSF, self).__init__()
437
- self.num_kernels = len(resblock_kernel_sizes)
438
- self.num_upsamples = len(upsample_rates)
439
-
440
- self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
- self.m_source = SourceModuleHnNSF(
442
- sampling_rate=sr, harmonic_num=0, is_half=is_half
443
- )
444
- self.noise_convs = nn.ModuleList()
445
- self.conv_pre = Conv1d(
446
- initial_channel, upsample_initial_channel, 7, 1, padding=3
447
- )
448
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
-
450
- self.ups = nn.ModuleList()
451
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
- c_cur = upsample_initial_channel // (2 ** (i + 1))
453
- self.ups.append(
454
- weight_norm(
455
- ConvTranspose1d(
456
- upsample_initial_channel // (2**i),
457
- upsample_initial_channel // (2 ** (i + 1)),
458
- k,
459
- u,
460
- padding=(k - u) // 2,
461
- )
462
- )
463
- )
464
- if i + 1 < len(upsample_rates):
465
- stride_f0 = np.prod(upsample_rates[i + 1 :])
466
- self.noise_convs.append(
467
- Conv1d(
468
- 1,
469
- c_cur,
470
- kernel_size=stride_f0 * 2,
471
- stride=stride_f0,
472
- padding=stride_f0 // 2,
473
- )
474
- )
475
- else:
476
- self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
-
478
- self.resblocks = nn.ModuleList()
479
- for i in range(len(self.ups)):
480
- ch = upsample_initial_channel // (2 ** (i + 1))
481
- for j, (k, d) in enumerate(
482
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
- ):
484
- self.resblocks.append(resblock(ch, k, d))
485
-
486
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
- self.ups.apply(init_weights)
488
-
489
- if gin_channels != 0:
490
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
-
492
- self.upp = np.prod(upsample_rates)
493
-
494
- def forward(self, x, f0, g=None):
495
- har_source, noi_source, uv = self.m_source(f0, self.upp)
496
- har_source = har_source.transpose(1, 2)
497
- x = self.conv_pre(x)
498
- if g is not None:
499
- x = x + self.cond(g)
500
-
501
- for i in range(self.num_upsamples):
502
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
- x = self.ups[i](x)
504
- x_source = self.noise_convs[i](har_source)
505
- x = x + x_source
506
- xs = None
507
- for j in range(self.num_kernels):
508
- if xs is None:
509
- xs = self.resblocks[i * self.num_kernels + j](x)
510
- else:
511
- xs += self.resblocks[i * self.num_kernels + j](x)
512
- x = xs / self.num_kernels
513
- x = F.leaky_relu(x)
514
- x = self.conv_post(x)
515
- x = torch.tanh(x)
516
- return x
517
-
518
- def remove_weight_norm(self):
519
- for l in self.ups:
520
- remove_weight_norm(l)
521
- for l in self.resblocks:
522
- l.remove_weight_norm()
523
-
524
-
525
- sr2sr = {
526
- "32k": 32000,
527
- "40k": 40000,
528
- "48k": 48000,
529
- }
530
-
531
-
532
- class SynthesizerTrnMsNSFsidM(nn.Module):
533
- def __init__(
534
- self,
535
- spec_channels,
536
- segment_size,
537
- inter_channels,
538
- hidden_channels,
539
- filter_channels,
540
- n_heads,
541
- n_layers,
542
- kernel_size,
543
- p_dropout,
544
- resblock,
545
- resblock_kernel_sizes,
546
- resblock_dilation_sizes,
547
- upsample_rates,
548
- upsample_initial_channel,
549
- upsample_kernel_sizes,
550
- spk_embed_dim,
551
- gin_channels,
552
- sr,
553
- version,
554
- **kwargs
555
- ):
556
- super().__init__()
557
- if type(sr) == type("strr"):
558
- sr = sr2sr[sr]
559
- self.spec_channels = spec_channels
560
- self.inter_channels = inter_channels
561
- self.hidden_channels = hidden_channels
562
- self.filter_channels = filter_channels
563
- self.n_heads = n_heads
564
- self.n_layers = n_layers
565
- self.kernel_size = kernel_size
566
- self.p_dropout = p_dropout
567
- self.resblock = resblock
568
- self.resblock_kernel_sizes = resblock_kernel_sizes
569
- self.resblock_dilation_sizes = resblock_dilation_sizes
570
- self.upsample_rates = upsample_rates
571
- self.upsample_initial_channel = upsample_initial_channel
572
- self.upsample_kernel_sizes = upsample_kernel_sizes
573
- self.segment_size = segment_size
574
- self.gin_channels = gin_channels
575
- # self.hop_length = hop_length#
576
- self.spk_embed_dim = spk_embed_dim
577
- if version == "v1":
578
- self.enc_p = TextEncoder256(
579
- inter_channels,
580
- hidden_channels,
581
- filter_channels,
582
- n_heads,
583
- n_layers,
584
- kernel_size,
585
- p_dropout,
586
- )
587
- else:
588
- self.enc_p = TextEncoder768(
589
- inter_channels,
590
- hidden_channels,
591
- filter_channels,
592
- n_heads,
593
- n_layers,
594
- kernel_size,
595
- p_dropout,
596
- )
597
- self.dec = GeneratorNSF(
598
- inter_channels,
599
- resblock,
600
- resblock_kernel_sizes,
601
- resblock_dilation_sizes,
602
- upsample_rates,
603
- upsample_initial_channel,
604
- upsample_kernel_sizes,
605
- gin_channels=gin_channels,
606
- sr=sr,
607
- is_half=kwargs["is_half"],
608
- )
609
- self.enc_q = PosteriorEncoder(
610
- spec_channels,
611
- inter_channels,
612
- hidden_channels,
613
- 5,
614
- 1,
615
- 16,
616
- gin_channels=gin_channels,
617
- )
618
- self.flow = ResidualCouplingBlock(
619
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
620
- )
621
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
622
- self.speaker_map = None
623
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
624
-
625
- def remove_weight_norm(self):
626
- self.dec.remove_weight_norm()
627
- self.flow.remove_weight_norm()
628
- self.enc_q.remove_weight_norm()
629
-
630
- def construct_spkmixmap(self, n_speaker):
631
- self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
632
- for i in range(n_speaker):
633
- self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
634
- self.speaker_map = self.speaker_map.unsqueeze(0)
635
-
636
- def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
637
- if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
638
- g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
639
- g = g * self.speaker_map # [N, S, B, 1, H]
640
- g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
641
- g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
642
- else:
643
- g = g.unsqueeze(0)
644
- g = self.emb_g(g).transpose(1, 2)
645
-
646
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
647
- z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
648
- z = self.flow(z_p, x_mask, g=g, reverse=True)
649
- o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
650
- return o
651
-
652
-
653
- class MultiPeriodDiscriminator(torch.nn.Module):
654
- def __init__(self, use_spectral_norm=False):
655
- super(MultiPeriodDiscriminator, self).__init__()
656
- periods = [2, 3, 5, 7, 11, 17]
657
- # periods = [3, 5, 7, 11, 17, 23, 37]
658
-
659
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
660
- discs = discs + [
661
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
662
- ]
663
- self.discriminators = nn.ModuleList(discs)
664
-
665
- def forward(self, y, y_hat):
666
- y_d_rs = [] #
667
- y_d_gs = []
668
- fmap_rs = []
669
- fmap_gs = []
670
- for i, d in enumerate(self.discriminators):
671
- y_d_r, fmap_r = d(y)
672
- y_d_g, fmap_g = d(y_hat)
673
- # for j in range(len(fmap_r)):
674
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
675
- y_d_rs.append(y_d_r)
676
- y_d_gs.append(y_d_g)
677
- fmap_rs.append(fmap_r)
678
- fmap_gs.append(fmap_g)
679
-
680
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
681
-
682
-
683
- class MultiPeriodDiscriminatorV2(torch.nn.Module):
684
- def __init__(self, use_spectral_norm=False):
685
- super(MultiPeriodDiscriminatorV2, self).__init__()
686
- # periods = [2, 3, 5, 7, 11, 17]
687
- periods = [2, 3, 5, 7, 11, 17, 23, 37]
688
-
689
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
690
- discs = discs + [
691
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
692
- ]
693
- self.discriminators = nn.ModuleList(discs)
694
-
695
- def forward(self, y, y_hat):
696
- y_d_rs = [] #
697
- y_d_gs = []
698
- fmap_rs = []
699
- fmap_gs = []
700
- for i, d in enumerate(self.discriminators):
701
- y_d_r, fmap_r = d(y)
702
- y_d_g, fmap_g = d(y_hat)
703
- # for j in range(len(fmap_r)):
704
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
705
- y_d_rs.append(y_d_r)
706
- y_d_gs.append(y_d_g)
707
- fmap_rs.append(fmap_r)
708
- fmap_gs.append(fmap_g)
709
-
710
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
711
-
712
-
713
- class DiscriminatorS(torch.nn.Module):
714
- def __init__(self, use_spectral_norm=False):
715
- super(DiscriminatorS, self).__init__()
716
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
717
- self.convs = nn.ModuleList(
718
- [
719
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
720
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
721
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
722
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
723
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
724
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
725
- ]
726
- )
727
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
728
-
729
- def forward(self, x):
730
- fmap = []
731
-
732
- for l in self.convs:
733
- x = l(x)
734
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
735
- fmap.append(x)
736
- x = self.conv_post(x)
737
- fmap.append(x)
738
- x = torch.flatten(x, 1, -1)
739
-
740
- return x, fmap
741
-
742
-
743
class DiscriminatorP(torch.nn.Module):
    """Period discriminator.

    Folds the 1-D waveform into a 2-D map of width `period` (reflect-padding
    to a multiple of the period first), then applies strided 2-D convolutions
    along the time axis.  Returns flattened logits plus per-layer feature maps.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        # All convs use a (kernel_size, 1) kernel: they mix along time only,
        # never across the period axis.
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        1,
                        32,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        32,
                        128,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        128,
                        512,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        512,
                        1024,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        1024,
                        1024,
                        (kernel_size, 1),
                        1,
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened logits, list of intermediate feature maps)."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
 
 
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from lib.infer_pack import modules
7
+ from lib.infer_pack import attentions
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import remove_weight_norm
12
+ from torch.nn.utils.parametrizations import spectral_norm, weight_norm
13
+ from lib.infer_pack.commons import init_weights
14
+ import numpy as np
15
+ from lib.infer_pack import commons
16
+
17
+
18
class TextEncoder256(nn.Module):
    """Prior/text encoder for 256-dim phone features (v1 models).

    Embeds 256-dim phone features (plus an optional coarse-pitch index),
    runs a self-attention encoder, and projects to the mean / log-variance
    of the latent prior.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Pitch embedding only exists for f0-conditioned models.
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phones (and optional pitch) into prior statistics.

        Returns (m, logs, x_mask): mean and log-variance of shape
        [b, out_channels, t] and the length mask of shape [b, 1, t].
        """
        # PEP 8: compare with None by identity, not `== None` (the latter
        # dispatches to tensor __eq__ overloads).
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
63
+
64
+
65
class TextEncoder768(nn.Module):
    """Prior/text encoder for 768-dim phone features (v2 models).

    Identical to TextEncoder256 except the phone feature dimension is 768.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Pitch embedding only exists for f0-conditioned models.
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phones (and optional pitch) into prior statistics.

        Returns (m, logs, x_mask): mean and log-variance of shape
        [b, out_channels, t] and the length mask of shape [b, 1, t].
        """
        # PEP 8: identity comparison with None instead of `== None`.
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
110
+
111
+
112
class ResidualCouplingBlock(nn.Module):
    """Invertible flow bridging the posterior latent and the text prior.

    Stacks `n_flows` mean-only ResidualCouplingLayer steps, each followed
    by a channel Flip so alternating halves of the channels get transformed.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flow (reverse=True runs the inverse direction).

        g is the optional conditioning tensor.  In the forward direction
        each flow also returns a log-determinant, which is discarded here.
        """
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        # Only the coupling layers (even indices) carry weight norm;
        # the interleaved Flip modules have no parameters.
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
159
+
160
+
161
class PosteriorEncoder(nn.Module):
    """Posterior encoder q(z | spectrogram).

    1x1 pre-conv -> WaveNet-style (WN) stack -> 1x1 projection to
    mean/log-variance; samples z with the reparameterization trick.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return (z, m, logs, x_mask).

        x: [b, in_channels, t] spectrogram; x_lengths: valid lengths;
        g: optional conditioning passed to the WN stack.
        """
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterization: z = m + eps * std, masked to valid frames.
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()
204
+
205
+
206
class Generator(torch.nn.Module):
    """HiFi-GAN-style waveform generator (no NSF source).

    Alternates transposed-conv upsampling stages with multi-receptive-field
    residual blocks; the outputs of each stage's resblocks are averaged.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # resblock is the config string "1"/"2" selecting the block type.
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        # `ch` is the channel count after the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """x: [b, initial_channel, t] features; g: optional speaker cond."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's num_kernels resblocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)  # waveform in [-1, 1]

        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
280
+
281
+
282
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-wavefrom (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_thoreshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel per harmonic plus the fundamental.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal: 1.0 where f0 is above the voiced threshold.
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
        f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        upp: integer upsampling factor from frame rate to sample rate.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # Per-sample phase increment in cycles; the % 1 here means the
            # harmonic products cannot be fused/optimized downstream.
            rad_values = (f0_buf / self.sampling_rate) % 1
            # Random initial phase per harmonic (fundamental fixed to 0).
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # Accumulated phase (a % 1 here would prevent optimizing the
            # later cumsum).
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(
                2, 1
            )  #
            tmp_over_one %= 1
            # Detect phase wrap-arounds to keep the cumulative sum continuous.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Voiced regions get low-level noise; unvoiced regions get
            # noise at sine_amp / 3 instead of a sine.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
372
+
373
+
374
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threhold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Merge the generated harmonics into one excitation channel.

        Returns (sine_merge, None, None); the noise and uv outputs of the
        sine generator are intentionally not exposed here.
        """
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            # Match the half-precision inference path of the decoder.
            sine_wavs = sine_wavs.half()
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv
421
+
422
+
423
class GeneratorNSF(torch.nn.Module):
    """HiFi-GAN-style generator with an NSF harmonic source.

    In addition to the usual upsample/resblock stack, a SourceModuleHnNSF
    builds a sine excitation from the f0 track; per-stage `noise_convs`
    bring that excitation down to each stage's resolution and add it in
    after every upsampling step.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # resblock is the config string "1"/"2" selecting the block type.
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            # Strided conv that downsamples the full-rate sine source to
            # this stage's temporal resolution (last stage: 1x1 conv).
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        # `ch` is the channel count after the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        # Total upsampling factor (product of all stage rates).
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """x: [b, initial_channel, t] features; f0: pitch track; g: cond."""
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)  # [b, 1, t * upp]
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            # Inject the harmonic excitation at this resolution.
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            # Average the outputs of this stage's num_kernels resblocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)  # waveform in [-1, 1]
        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
524
+
525
+
526
# Mapping from sample-rate config strings to their value in Hz.
sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}
531
+
532
+
533
class SynthesizerTrnMsNSFsidM(nn.Module):
    """Multi-speaker NSF synthesizer used for inference / export.

    Wires together the text encoder (v1: 256-dim features, v2: 768-dim),
    the posterior encoder, the coupling flow and the NSF decoder.
    `construct_spkmixmap` pre-bakes the speaker embedding table so that
    `g` in forward() can be a per-frame speaker-mixture weight tensor
    instead of a single speaker id.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        version,
        **kwargs
    ):
        super().__init__()
        # Accept both "40k"-style config strings and integer sample rates.
        # isinstance() replaces the old `type(sr) == type("strr")` check.
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        # v1 checkpoints use 256-dim phone features, v2 uses 768-dim.
        if version == "v1":
            self.enc_p = TextEncoder256(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
            )
        else:
            self.enc_p = TextEncoder768(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
            )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        # Filled in by construct_spkmixmap(); None means plain id lookup.
        self.speaker_map = None
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight norm from all submodules before export/inference."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def construct_spkmixmap(self, n_speaker):
        """Pre-compute embeddings for speakers 0..n_speaker-1 so forward()
        can blend speakers with per-frame mixture weights."""
        self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
        for i in range(n_speaker):
            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
        self.speaker_map = self.speaker_map.unsqueeze(0)

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize audio from phone features, pitch and speaker info.

        g is either a speaker id (speaker_map unset) or per-frame mixture
        weights (after construct_spkmixmap); rnd is the externally supplied
        noise used in place of sampling, which keeps export deterministic.
        """
        if self.speaker_map is not None:  # [N, S] * [S, B, 1, H]
            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
            g = g * self.speaker_map  # [N, S, B, 1, H]
            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
        else:
            g = g.unsqueeze(0)
            g = self.emb_g(g).transpose(1, 2)

        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
652
+
653
+
654
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of one scale discriminator plus one period discriminator
    per entry of the v1 period list."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        # periods = [3, 5, 7, 11, 17, 23, 37]

        sub_discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        sub_discs.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(sub_discs)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on the real (y) and generated
        (y_hat) waveforms.

        Returns (real scores, fake scores, real fmaps, fake fmaps), each a
        list with one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_fake, feats_fake = disc(y_hat)
            y_d_rs.append(score_real)
            y_d_gs.append(score_fake)
            fmap_rs.append(feats_real)
            fmap_gs.append(feats_fake)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
682
+
683
+
684
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Ensemble of one scale discriminator plus one period discriminator
    per entry of the extended v2 period list."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        # periods = [2, 3, 5, 7, 11, 17]
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        sub_discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        sub_discs.extend(
            DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods
        )
        self.discriminators = nn.ModuleList(sub_discs)

    def forward(self, y, y_hat):
        """Run every sub-discriminator on the real (y) and generated
        (y_hat) waveforms.

        Returns (real scores, fake scores, real fmaps, fake fmaps), each a
        list with one entry per sub-discriminator.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, feats_real = disc(y)
            score_fake, feats_fake = disc(y_hat)
            y_d_rs.append(score_real)
            y_d_gs.append(score_fake)
            fmap_rs.append(feats_real)
            fmap_gs.append(feats_fake)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
712
+
713
+
714
class DiscriminatorS(torch.nn.Module):
    """Scale (waveform) discriminator: strided, grouped 1-D convolutions
    over the raw waveform, returning logits plus per-layer feature maps."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in_ch, out_ch, kernel, stride, groups, padding) per layer.
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            norm_f(Conv1d(ci, co, k, s, groups=g, padding=p))
            for ci, co, k, s, g, p in layer_specs
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return (flattened logits, list of intermediate feature maps)."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
742
+
743
+
744
class DiscriminatorP(torch.nn.Module):
    """Period discriminator.

    Folds the 1-D waveform into a 2-D map of width `period` (reflect-padding
    to a multiple of the period first), then applies strided 2-D convolutions
    along the time axis.  Returns flattened logits plus per-layer feature maps.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        # All convs use a (kernel_size, 1) kernel: they mix along time only,
        # never across the period axis.
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        1,
                        32,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        32,
                        128,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        128,
                        512,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        512,
                        1024,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        1024,
                        1024,
                        (kernel_size, 1),
                        1,
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return (flattened logits, list of intermediate feature maps)."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
lib/infer_pack/modules.py CHANGED
@@ -1,522 +1,524 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import scipy
5
- import torch
6
- from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
- from torch.nn.utils import weight_norm, remove_weight_norm
11
-
12
- from lib.infer_pack import commons
13
- from lib.infer_pack.commons import init_weights, get_padding
14
- from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
15
-
16
-
17
- LRELU_SLOPE = 0.1
18
-
19
-
20
- class LayerNorm(nn.Module):
21
- def __init__(self, channels, eps=1e-5):
22
- super().__init__()
23
- self.channels = channels
24
- self.eps = eps
25
-
26
- self.gamma = nn.Parameter(torch.ones(channels))
27
- self.beta = nn.Parameter(torch.zeros(channels))
28
-
29
- def forward(self, x):
30
- x = x.transpose(1, -1)
31
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
- return x.transpose(1, -1)
33
-
34
-
35
- class ConvReluNorm(nn.Module):
36
- def __init__(
37
- self,
38
- in_channels,
39
- hidden_channels,
40
- out_channels,
41
- kernel_size,
42
- n_layers,
43
- p_dropout,
44
- ):
45
- super().__init__()
46
- self.in_channels = in_channels
47
- self.hidden_channels = hidden_channels
48
- self.out_channels = out_channels
49
- self.kernel_size = kernel_size
50
- self.n_layers = n_layers
51
- self.p_dropout = p_dropout
52
- assert n_layers > 1, "Number of layers should be larger than 0."
53
-
54
- self.conv_layers = nn.ModuleList()
55
- self.norm_layers = nn.ModuleList()
56
- self.conv_layers.append(
57
- nn.Conv1d(
58
- in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59
- )
60
- )
61
- self.norm_layers.append(LayerNorm(hidden_channels))
62
- self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
63
- for _ in range(n_layers - 1):
64
- self.conv_layers.append(
65
- nn.Conv1d(
66
- hidden_channels,
67
- hidden_channels,
68
- kernel_size,
69
- padding=kernel_size // 2,
70
- )
71
- )
72
- self.norm_layers.append(LayerNorm(hidden_channels))
73
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74
- self.proj.weight.data.zero_()
75
- self.proj.bias.data.zero_()
76
-
77
- def forward(self, x, x_mask):
78
- x_org = x
79
- for i in range(self.n_layers):
80
- x = self.conv_layers[i](x * x_mask)
81
- x = self.norm_layers[i](x)
82
- x = self.relu_drop(x)
83
- x = x_org + self.proj(x)
84
- return x * x_mask
85
-
86
-
87
- class DDSConv(nn.Module):
88
- """
89
- Dialted and Depth-Separable Convolution
90
- """
91
-
92
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93
- super().__init__()
94
- self.channels = channels
95
- self.kernel_size = kernel_size
96
- self.n_layers = n_layers
97
- self.p_dropout = p_dropout
98
-
99
- self.drop = nn.Dropout(p_dropout)
100
- self.convs_sep = nn.ModuleList()
101
- self.convs_1x1 = nn.ModuleList()
102
- self.norms_1 = nn.ModuleList()
103
- self.norms_2 = nn.ModuleList()
104
- for i in range(n_layers):
105
- dilation = kernel_size**i
106
- padding = (kernel_size * dilation - dilation) // 2
107
- self.convs_sep.append(
108
- nn.Conv1d(
109
- channels,
110
- channels,
111
- kernel_size,
112
- groups=channels,
113
- dilation=dilation,
114
- padding=padding,
115
- )
116
- )
117
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118
- self.norms_1.append(LayerNorm(channels))
119
- self.norms_2.append(LayerNorm(channels))
120
-
121
- def forward(self, x, x_mask, g=None):
122
- if g is not None:
123
- x = x + g
124
- for i in range(self.n_layers):
125
- y = self.convs_sep[i](x * x_mask)
126
- y = self.norms_1[i](y)
127
- y = F.gelu(y)
128
- y = self.convs_1x1[i](y)
129
- y = self.norms_2[i](y)
130
- y = F.gelu(y)
131
- y = self.drop(y)
132
- x = x + y
133
- return x * x_mask
134
-
135
-
136
- class WN(torch.nn.Module):
137
- def __init__(
138
- self,
139
- hidden_channels,
140
- kernel_size,
141
- dilation_rate,
142
- n_layers,
143
- gin_channels=0,
144
- p_dropout=0,
145
- ):
146
- super(WN, self).__init__()
147
- assert kernel_size % 2 == 1
148
- self.hidden_channels = hidden_channels
149
- self.kernel_size = (kernel_size,)
150
- self.dilation_rate = dilation_rate
151
- self.n_layers = n_layers
152
- self.gin_channels = gin_channels
153
- self.p_dropout = p_dropout
154
-
155
- self.in_layers = torch.nn.ModuleList()
156
- self.res_skip_layers = torch.nn.ModuleList()
157
- self.drop = nn.Dropout(p_dropout)
158
-
159
- if gin_channels != 0:
160
- cond_layer = torch.nn.Conv1d(
161
- gin_channels, 2 * hidden_channels * n_layers, 1
162
- )
163
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164
-
165
- for i in range(n_layers):
166
- dilation = dilation_rate**i
167
- padding = int((kernel_size * dilation - dilation) / 2)
168
- in_layer = torch.nn.Conv1d(
169
- hidden_channels,
170
- 2 * hidden_channels,
171
- kernel_size,
172
- dilation=dilation,
173
- padding=padding,
174
- )
175
- in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176
- self.in_layers.append(in_layer)
177
-
178
- # last one is not necessary
179
- if i < n_layers - 1:
180
- res_skip_channels = 2 * hidden_channels
181
- else:
182
- res_skip_channels = hidden_channels
183
-
184
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186
- self.res_skip_layers.append(res_skip_layer)
187
-
188
- def forward(self, x, x_mask, g=None, **kwargs):
189
- output = torch.zeros_like(x)
190
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
191
-
192
- if g is not None:
193
- g = self.cond_layer(g)
194
-
195
- for i in range(self.n_layers):
196
- x_in = self.in_layers[i](x)
197
- if g is not None:
198
- cond_offset = i * 2 * self.hidden_channels
199
- g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
200
- else:
201
- g_l = torch.zeros_like(x_in)
202
-
203
- acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
204
- acts = self.drop(acts)
205
-
206
- res_skip_acts = self.res_skip_layers[i](acts)
207
- if i < self.n_layers - 1:
208
- res_acts = res_skip_acts[:, : self.hidden_channels, :]
209
- x = (x + res_acts) * x_mask
210
- output = output + res_skip_acts[:, self.hidden_channels :, :]
211
- else:
212
- output = output + res_skip_acts
213
- return output * x_mask
214
-
215
- def remove_weight_norm(self):
216
- if self.gin_channels != 0:
217
- torch.nn.utils.remove_weight_norm(self.cond_layer)
218
- for l in self.in_layers:
219
- torch.nn.utils.remove_weight_norm(l)
220
- for l in self.res_skip_layers:
221
- torch.nn.utils.remove_weight_norm(l)
222
-
223
-
224
- class ResBlock1(torch.nn.Module):
225
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
226
- super(ResBlock1, self).__init__()
227
- self.convs1 = nn.ModuleList(
228
- [
229
- weight_norm(
230
- Conv1d(
231
- channels,
232
- channels,
233
- kernel_size,
234
- 1,
235
- dilation=dilation[0],
236
- padding=get_padding(kernel_size, dilation[0]),
237
- )
238
- ),
239
- weight_norm(
240
- Conv1d(
241
- channels,
242
- channels,
243
- kernel_size,
244
- 1,
245
- dilation=dilation[1],
246
- padding=get_padding(kernel_size, dilation[1]),
247
- )
248
- ),
249
- weight_norm(
250
- Conv1d(
251
- channels,
252
- channels,
253
- kernel_size,
254
- 1,
255
- dilation=dilation[2],
256
- padding=get_padding(kernel_size, dilation[2]),
257
- )
258
- ),
259
- ]
260
- )
261
- self.convs1.apply(init_weights)
262
-
263
- self.convs2 = nn.ModuleList(
264
- [
265
- weight_norm(
266
- Conv1d(
267
- channels,
268
- channels,
269
- kernel_size,
270
- 1,
271
- dilation=1,
272
- padding=get_padding(kernel_size, 1),
273
- )
274
- ),
275
- weight_norm(
276
- Conv1d(
277
- channels,
278
- channels,
279
- kernel_size,
280
- 1,
281
- dilation=1,
282
- padding=get_padding(kernel_size, 1),
283
- )
284
- ),
285
- weight_norm(
286
- Conv1d(
287
- channels,
288
- channels,
289
- kernel_size,
290
- 1,
291
- dilation=1,
292
- padding=get_padding(kernel_size, 1),
293
- )
294
- ),
295
- ]
296
- )
297
- self.convs2.apply(init_weights)
298
-
299
- def forward(self, x, x_mask=None):
300
- for c1, c2 in zip(self.convs1, self.convs2):
301
- xt = F.leaky_relu(x, LRELU_SLOPE)
302
- if x_mask is not None:
303
- xt = xt * x_mask
304
- xt = c1(xt)
305
- xt = F.leaky_relu(xt, LRELU_SLOPE)
306
- if x_mask is not None:
307
- xt = xt * x_mask
308
- xt = c2(xt)
309
- x = xt + x
310
- if x_mask is not None:
311
- x = x * x_mask
312
- return x
313
-
314
- def remove_weight_norm(self):
315
- for l in self.convs1:
316
- remove_weight_norm(l)
317
- for l in self.convs2:
318
- remove_weight_norm(l)
319
-
320
-
321
- class ResBlock2(torch.nn.Module):
322
- def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
323
- super(ResBlock2, self).__init__()
324
- self.convs = nn.ModuleList(
325
- [
326
- weight_norm(
327
- Conv1d(
328
- channels,
329
- channels,
330
- kernel_size,
331
- 1,
332
- dilation=dilation[0],
333
- padding=get_padding(kernel_size, dilation[0]),
334
- )
335
- ),
336
- weight_norm(
337
- Conv1d(
338
- channels,
339
- channels,
340
- kernel_size,
341
- 1,
342
- dilation=dilation[1],
343
- padding=get_padding(kernel_size, dilation[1]),
344
- )
345
- ),
346
- ]
347
- )
348
- self.convs.apply(init_weights)
349
-
350
- def forward(self, x, x_mask=None):
351
- for c in self.convs:
352
- xt = F.leaky_relu(x, LRELU_SLOPE)
353
- if x_mask is not None:
354
- xt = xt * x_mask
355
- xt = c(xt)
356
- x = xt + x
357
- if x_mask is not None:
358
- x = x * x_mask
359
- return x
360
-
361
- def remove_weight_norm(self):
362
- for l in self.convs:
363
- remove_weight_norm(l)
364
-
365
-
366
- class Log(nn.Module):
367
- def forward(self, x, x_mask, reverse=False, **kwargs):
368
- if not reverse:
369
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
370
- logdet = torch.sum(-y, [1, 2])
371
- return y, logdet
372
- else:
373
- x = torch.exp(x) * x_mask
374
- return x
375
-
376
-
377
- class Flip(nn.Module):
378
- def forward(self, x, *args, reverse=False, **kwargs):
379
- x = torch.flip(x, [1])
380
- if not reverse:
381
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
382
- return x, logdet
383
- else:
384
- return x
385
-
386
-
387
- class ElementwiseAffine(nn.Module):
388
- def __init__(self, channels):
389
- super().__init__()
390
- self.channels = channels
391
- self.m = nn.Parameter(torch.zeros(channels, 1))
392
- self.logs = nn.Parameter(torch.zeros(channels, 1))
393
-
394
- def forward(self, x, x_mask, reverse=False, **kwargs):
395
- if not reverse:
396
- y = self.m + torch.exp(self.logs) * x
397
- y = y * x_mask
398
- logdet = torch.sum(self.logs * x_mask, [1, 2])
399
- return y, logdet
400
- else:
401
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
402
- return x
403
-
404
-
405
- class ResidualCouplingLayer(nn.Module):
406
- def __init__(
407
- self,
408
- channels,
409
- hidden_channels,
410
- kernel_size,
411
- dilation_rate,
412
- n_layers,
413
- p_dropout=0,
414
- gin_channels=0,
415
- mean_only=False,
416
- ):
417
- assert channels % 2 == 0, "channels should be divisible by 2"
418
- super().__init__()
419
- self.channels = channels
420
- self.hidden_channels = hidden_channels
421
- self.kernel_size = kernel_size
422
- self.dilation_rate = dilation_rate
423
- self.n_layers = n_layers
424
- self.half_channels = channels // 2
425
- self.mean_only = mean_only
426
-
427
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
428
- self.enc = WN(
429
- hidden_channels,
430
- kernel_size,
431
- dilation_rate,
432
- n_layers,
433
- p_dropout=p_dropout,
434
- gin_channels=gin_channels,
435
- )
436
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
437
- self.post.weight.data.zero_()
438
- self.post.bias.data.zero_()
439
-
440
- def forward(self, x, x_mask, g=None, reverse=False):
441
- x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
442
- h = self.pre(x0) * x_mask
443
- h = self.enc(h, x_mask, g=g)
444
- stats = self.post(h) * x_mask
445
- if not self.mean_only:
446
- m, logs = torch.split(stats, [self.half_channels] * 2, 1)
447
- else:
448
- m = stats
449
- logs = torch.zeros_like(m)
450
-
451
- if not reverse:
452
- x1 = m + x1 * torch.exp(logs) * x_mask
453
- x = torch.cat([x0, x1], 1)
454
- logdet = torch.sum(logs, [1, 2])
455
- return x, logdet
456
- else:
457
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
458
- x = torch.cat([x0, x1], 1)
459
- return x
460
-
461
- def remove_weight_norm(self):
462
- self.enc.remove_weight_norm()
463
-
464
-
465
- class ConvFlow(nn.Module):
466
- def __init__(
467
- self,
468
- in_channels,
469
- filter_channels,
470
- kernel_size,
471
- n_layers,
472
- num_bins=10,
473
- tail_bound=5.0,
474
- ):
475
- super().__init__()
476
- self.in_channels = in_channels
477
- self.filter_channels = filter_channels
478
- self.kernel_size = kernel_size
479
- self.n_layers = n_layers
480
- self.num_bins = num_bins
481
- self.tail_bound = tail_bound
482
- self.half_channels = in_channels // 2
483
-
484
- self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
485
- self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
486
- self.proj = nn.Conv1d(
487
- filter_channels, self.half_channels * (num_bins * 3 - 1), 1
488
- )
489
- self.proj.weight.data.zero_()
490
- self.proj.bias.data.zero_()
491
-
492
- def forward(self, x, x_mask, g=None, reverse=False):
493
- x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
494
- h = self.pre(x0)
495
- h = self.convs(h, x_mask, g=g)
496
- h = self.proj(h) * x_mask
497
-
498
- b, c, t = x0.shape
499
- h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
500
-
501
- unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
502
- unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
503
- self.filter_channels
504
- )
505
- unnormalized_derivatives = h[..., 2 * self.num_bins :]
506
-
507
- x1, logabsdet = piecewise_rational_quadratic_transform(
508
- x1,
509
- unnormalized_widths,
510
- unnormalized_heights,
511
- unnormalized_derivatives,
512
- inverse=reverse,
513
- tails="linear",
514
- tail_bound=self.tail_bound,
515
- )
516
-
517
- x = torch.cat([x0, x1], 1) * x_mask
518
- logdet = torch.sum(logabsdet * x_mask, [1, 2])
519
- if not reverse:
520
- return x, logdet
521
- else:
522
- return x
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import remove_weight_norm
11
+ from torch.nn.utils.parametrizations import weight_norm
12
+
13
+
14
+ from lib.infer_pack import commons
15
+ from lib.infer_pack.commons import init_weights, get_padding
16
+ from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
17
+
18
+
19
+ LRELU_SLOPE = 0.1
20
+
21
+
22
+ class LayerNorm(nn.Module):
23
+ def __init__(self, channels, eps=1e-5):
24
+ super().__init__()
25
+ self.channels = channels
26
+ self.eps = eps
27
+
28
+ self.gamma = nn.Parameter(torch.ones(channels))
29
+ self.beta = nn.Parameter(torch.zeros(channels))
30
+
31
+ def forward(self, x):
32
+ x = x.transpose(1, -1)
33
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
34
+ return x.transpose(1, -1)
35
+
36
+
37
+ class ConvReluNorm(nn.Module):
38
+ def __init__(
39
+ self,
40
+ in_channels,
41
+ hidden_channels,
42
+ out_channels,
43
+ kernel_size,
44
+ n_layers,
45
+ p_dropout,
46
+ ):
47
+ super().__init__()
48
+ self.in_channels = in_channels
49
+ self.hidden_channels = hidden_channels
50
+ self.out_channels = out_channels
51
+ self.kernel_size = kernel_size
52
+ self.n_layers = n_layers
53
+ self.p_dropout = p_dropout
54
+ assert n_layers > 1, "Number of layers should be larger than 0."
55
+
56
+ self.conv_layers = nn.ModuleList()
57
+ self.norm_layers = nn.ModuleList()
58
+ self.conv_layers.append(
59
+ nn.Conv1d(
60
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
61
+ )
62
+ )
63
+ self.norm_layers.append(LayerNorm(hidden_channels))
64
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
65
+ for _ in range(n_layers - 1):
66
+ self.conv_layers.append(
67
+ nn.Conv1d(
68
+ hidden_channels,
69
+ hidden_channels,
70
+ kernel_size,
71
+ padding=kernel_size // 2,
72
+ )
73
+ )
74
+ self.norm_layers.append(LayerNorm(hidden_channels))
75
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
76
+ self.proj.weight.data.zero_()
77
+ self.proj.bias.data.zero_()
78
+
79
+ def forward(self, x, x_mask):
80
+ x_org = x
81
+ for i in range(self.n_layers):
82
+ x = self.conv_layers[i](x * x_mask)
83
+ x = self.norm_layers[i](x)
84
+ x = self.relu_drop(x)
85
+ x = x_org + self.proj(x)
86
+ return x * x_mask
87
+
88
+
89
+ class DDSConv(nn.Module):
90
+ """
91
+ Dialted and Depth-Separable Convolution
92
+ """
93
+
94
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
95
+ super().__init__()
96
+ self.channels = channels
97
+ self.kernel_size = kernel_size
98
+ self.n_layers = n_layers
99
+ self.p_dropout = p_dropout
100
+
101
+ self.drop = nn.Dropout(p_dropout)
102
+ self.convs_sep = nn.ModuleList()
103
+ self.convs_1x1 = nn.ModuleList()
104
+ self.norms_1 = nn.ModuleList()
105
+ self.norms_2 = nn.ModuleList()
106
+ for i in range(n_layers):
107
+ dilation = kernel_size**i
108
+ padding = (kernel_size * dilation - dilation) // 2
109
+ self.convs_sep.append(
110
+ nn.Conv1d(
111
+ channels,
112
+ channels,
113
+ kernel_size,
114
+ groups=channels,
115
+ dilation=dilation,
116
+ padding=padding,
117
+ )
118
+ )
119
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
120
+ self.norms_1.append(LayerNorm(channels))
121
+ self.norms_2.append(LayerNorm(channels))
122
+
123
+ def forward(self, x, x_mask, g=None):
124
+ if g is not None:
125
+ x = x + g
126
+ for i in range(self.n_layers):
127
+ y = self.convs_sep[i](x * x_mask)
128
+ y = self.norms_1[i](y)
129
+ y = F.gelu(y)
130
+ y = self.convs_1x1[i](y)
131
+ y = self.norms_2[i](y)
132
+ y = F.gelu(y)
133
+ y = self.drop(y)
134
+ x = x + y
135
+ return x * x_mask
136
+
137
+
138
+ class WN(torch.nn.Module):
139
+ def __init__(
140
+ self,
141
+ hidden_channels,
142
+ kernel_size,
143
+ dilation_rate,
144
+ n_layers,
145
+ gin_channels=0,
146
+ p_dropout=0,
147
+ ):
148
+ super(WN, self).__init__()
149
+ assert kernel_size % 2 == 1
150
+ self.hidden_channels = hidden_channels
151
+ self.kernel_size = (kernel_size,)
152
+ self.dilation_rate = dilation_rate
153
+ self.n_layers = n_layers
154
+ self.gin_channels = gin_channels
155
+ self.p_dropout = p_dropout
156
+
157
+ self.in_layers = torch.nn.ModuleList()
158
+ self.res_skip_layers = torch.nn.ModuleList()
159
+ self.drop = nn.Dropout(p_dropout)
160
+
161
+ if gin_channels != 0:
162
+ cond_layer = torch.nn.Conv1d(
163
+ gin_channels, 2 * hidden_channels * n_layers, 1
164
+ )
165
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight")
166
+
167
+ for i in range(n_layers):
168
+ dilation = dilation_rate**i
169
+ padding = int((kernel_size * dilation - dilation) / 2)
170
+ in_layer = torch.nn.Conv1d(
171
+ hidden_channels,
172
+ 2 * hidden_channels,
173
+ kernel_size,
174
+ dilation=dilation,
175
+ padding=padding,
176
+ )
177
+ in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
178
+ self.in_layers.append(in_layer)
179
+
180
+ # last one is not necessary
181
+ if i < n_layers - 1:
182
+ res_skip_channels = 2 * hidden_channels
183
+ else:
184
+ res_skip_channels = hidden_channels
185
+
186
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
187
+ res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
188
+ self.res_skip_layers.append(res_skip_layer)
189
+
190
+ def forward(self, x, x_mask, g=None, **kwargs):
191
+ output = torch.zeros_like(x)
192
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
193
+
194
+ if g is not None:
195
+ g = self.cond_layer(g)
196
+
197
+ for i in range(self.n_layers):
198
+ x_in = self.in_layers[i](x)
199
+ if g is not None:
200
+ cond_offset = i * 2 * self.hidden_channels
201
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
202
+ else:
203
+ g_l = torch.zeros_like(x_in)
204
+
205
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
206
+ acts = self.drop(acts)
207
+
208
+ res_skip_acts = self.res_skip_layers[i](acts)
209
+ if i < self.n_layers - 1:
210
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
211
+ x = (x + res_acts) * x_mask
212
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
213
+ else:
214
+ output = output + res_skip_acts
215
+ return output * x_mask
216
+
217
+ def remove_weight_norm(self):
218
+ if self.gin_channels != 0:
219
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
220
+ for l in self.in_layers:
221
+ torch.nn.utils.remove_weight_norm(l)
222
+ for l in self.res_skip_layers:
223
+ torch.nn.utils.remove_weight_norm(l)
224
+
225
+
226
+ class ResBlock1(torch.nn.Module):
227
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
228
+ super(ResBlock1, self).__init__()
229
+ self.convs1 = nn.ModuleList(
230
+ [
231
+ weight_norm(
232
+ Conv1d(
233
+ channels,
234
+ channels,
235
+ kernel_size,
236
+ 1,
237
+ dilation=dilation[0],
238
+ padding=get_padding(kernel_size, dilation[0]),
239
+ )
240
+ ),
241
+ weight_norm(
242
+ Conv1d(
243
+ channels,
244
+ channels,
245
+ kernel_size,
246
+ 1,
247
+ dilation=dilation[1],
248
+ padding=get_padding(kernel_size, dilation[1]),
249
+ )
250
+ ),
251
+ weight_norm(
252
+ Conv1d(
253
+ channels,
254
+ channels,
255
+ kernel_size,
256
+ 1,
257
+ dilation=dilation[2],
258
+ padding=get_padding(kernel_size, dilation[2]),
259
+ )
260
+ ),
261
+ ]
262
+ )
263
+ self.convs1.apply(init_weights)
264
+
265
+ self.convs2 = nn.ModuleList(
266
+ [
267
+ weight_norm(
268
+ Conv1d(
269
+ channels,
270
+ channels,
271
+ kernel_size,
272
+ 1,
273
+ dilation=1,
274
+ padding=get_padding(kernel_size, 1),
275
+ )
276
+ ),
277
+ weight_norm(
278
+ Conv1d(
279
+ channels,
280
+ channels,
281
+ kernel_size,
282
+ 1,
283
+ dilation=1,
284
+ padding=get_padding(kernel_size, 1),
285
+ )
286
+ ),
287
+ weight_norm(
288
+ Conv1d(
289
+ channels,
290
+ channels,
291
+ kernel_size,
292
+ 1,
293
+ dilation=1,
294
+ padding=get_padding(kernel_size, 1),
295
+ )
296
+ ),
297
+ ]
298
+ )
299
+ self.convs2.apply(init_weights)
300
+
301
+ def forward(self, x, x_mask=None):
302
+ for c1, c2 in zip(self.convs1, self.convs2):
303
+ xt = F.leaky_relu(x, LRELU_SLOPE)
304
+ if x_mask is not None:
305
+ xt = xt * x_mask
306
+ xt = c1(xt)
307
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
308
+ if x_mask is not None:
309
+ xt = xt * x_mask
310
+ xt = c2(xt)
311
+ x = xt + x
312
+ if x_mask is not None:
313
+ x = x * x_mask
314
+ return x
315
+
316
+ def remove_weight_norm(self):
317
+ for l in self.convs1:
318
+ remove_weight_norm(l)
319
+ for l in self.convs2:
320
+ remove_weight_norm(l)
321
+
322
+
323
+ class ResBlock2(torch.nn.Module):
324
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
325
+ super(ResBlock2, self).__init__()
326
+ self.convs = nn.ModuleList(
327
+ [
328
+ weight_norm(
329
+ Conv1d(
330
+ channels,
331
+ channels,
332
+ kernel_size,
333
+ 1,
334
+ dilation=dilation[0],
335
+ padding=get_padding(kernel_size, dilation[0]),
336
+ )
337
+ ),
338
+ weight_norm(
339
+ Conv1d(
340
+ channels,
341
+ channels,
342
+ kernel_size,
343
+ 1,
344
+ dilation=dilation[1],
345
+ padding=get_padding(kernel_size, dilation[1]),
346
+ )
347
+ ),
348
+ ]
349
+ )
350
+ self.convs.apply(init_weights)
351
+
352
+ def forward(self, x, x_mask=None):
353
+ for c in self.convs:
354
+ xt = F.leaky_relu(x, LRELU_SLOPE)
355
+ if x_mask is not None:
356
+ xt = xt * x_mask
357
+ xt = c(xt)
358
+ x = xt + x
359
+ if x_mask is not None:
360
+ x = x * x_mask
361
+ return x
362
+
363
+ def remove_weight_norm(self):
364
+ for l in self.convs:
365
+ remove_weight_norm(l)
366
+
367
+
368
+ class Log(nn.Module):
369
+ def forward(self, x, x_mask, reverse=False, **kwargs):
370
+ if not reverse:
371
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
372
+ logdet = torch.sum(-y, [1, 2])
373
+ return y, logdet
374
+ else:
375
+ x = torch.exp(x) * x_mask
376
+ return x
377
+
378
+
379
+ class Flip(nn.Module):
380
+ def forward(self, x, *args, reverse=False, **kwargs):
381
+ x = torch.flip(x, [1])
382
+ if not reverse:
383
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
384
+ return x, logdet
385
+ else:
386
+ return x
387
+
388
+
389
+ class ElementwiseAffine(nn.Module):
390
+ def __init__(self, channels):
391
+ super().__init__()
392
+ self.channels = channels
393
+ self.m = nn.Parameter(torch.zeros(channels, 1))
394
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
395
+
396
+ def forward(self, x, x_mask, reverse=False, **kwargs):
397
+ if not reverse:
398
+ y = self.m + torch.exp(self.logs) * x
399
+ y = y * x_mask
400
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
401
+ return y, logdet
402
+ else:
403
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
404
+ return x
405
+
406
+
407
+ class ResidualCouplingLayer(nn.Module):
408
+ def __init__(
409
+ self,
410
+ channels,
411
+ hidden_channels,
412
+ kernel_size,
413
+ dilation_rate,
414
+ n_layers,
415
+ p_dropout=0,
416
+ gin_channels=0,
417
+ mean_only=False,
418
+ ):
419
+ assert channels % 2 == 0, "channels should be divisible by 2"
420
+ super().__init__()
421
+ self.channels = channels
422
+ self.hidden_channels = hidden_channels
423
+ self.kernel_size = kernel_size
424
+ self.dilation_rate = dilation_rate
425
+ self.n_layers = n_layers
426
+ self.half_channels = channels // 2
427
+ self.mean_only = mean_only
428
+
429
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
430
+ self.enc = WN(
431
+ hidden_channels,
432
+ kernel_size,
433
+ dilation_rate,
434
+ n_layers,
435
+ p_dropout=p_dropout,
436
+ gin_channels=gin_channels,
437
+ )
438
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
439
+ self.post.weight.data.zero_()
440
+ self.post.bias.data.zero_()
441
+
442
+ def forward(self, x, x_mask, g=None, reverse=False):
443
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
444
+ h = self.pre(x0) * x_mask
445
+ h = self.enc(h, x_mask, g=g)
446
+ stats = self.post(h) * x_mask
447
+ if not self.mean_only:
448
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
449
+ else:
450
+ m = stats
451
+ logs = torch.zeros_like(m)
452
+
453
+ if not reverse:
454
+ x1 = m + x1 * torch.exp(logs) * x_mask
455
+ x = torch.cat([x0, x1], 1)
456
+ logdet = torch.sum(logs, [1, 2])
457
+ return x, logdet
458
+ else:
459
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
460
+ x = torch.cat([x0, x1], 1)
461
+ return x
462
+
463
+ def remove_weight_norm(self):
464
+ self.enc.remove_weight_norm()
465
+
466
+
467
+ class ConvFlow(nn.Module):
468
+ def __init__(
469
+ self,
470
+ in_channels,
471
+ filter_channels,
472
+ kernel_size,
473
+ n_layers,
474
+ num_bins=10,
475
+ tail_bound=5.0,
476
+ ):
477
+ super().__init__()
478
+ self.in_channels = in_channels
479
+ self.filter_channels = filter_channels
480
+ self.kernel_size = kernel_size
481
+ self.n_layers = n_layers
482
+ self.num_bins = num_bins
483
+ self.tail_bound = tail_bound
484
+ self.half_channels = in_channels // 2
485
+
486
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
487
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
488
+ self.proj = nn.Conv1d(
489
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
490
+ )
491
+ self.proj.weight.data.zero_()
492
+ self.proj.bias.data.zero_()
493
+
494
+ def forward(self, x, x_mask, g=None, reverse=False):
495
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
496
+ h = self.pre(x0)
497
+ h = self.convs(h, x_mask, g=g)
498
+ h = self.proj(h) * x_mask
499
+
500
+ b, c, t = x0.shape
501
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
502
+
503
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
504
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
505
+ self.filter_channels
506
+ )
507
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
508
+
509
+ x1, logabsdet = piecewise_rational_quadratic_transform(
510
+ x1,
511
+ unnormalized_widths,
512
+ unnormalized_heights,
513
+ unnormalized_derivatives,
514
+ inverse=reverse,
515
+ tails="linear",
516
+ tail_bound=self.tail_bound,
517
+ )
518
+
519
+ x = torch.cat([x0, x1], 1) * x_mask
520
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
521
+ if not reverse:
522
+ return x, logdet
523
+ else:
524
+ return x
weights/hololive-en/Cecilia/Cecilia_KitLemonfoot.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49cf63e1d99b137f1c5734545c590c5f1175927368e473c132baa5d65b351e76
3
+ size 55225160
weights/hololive-en/Cecilia/added_IVF1477_Flat_nprobe_1_CeciliaImmergreen_Singing_KitLemonfoot_v2_mbkm.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:292ea4eb3224095fe0560dfab73999094c117353bfd0abc84258bccbf69a02cd
3
+ size 31588619
weights/hololive-en/Cecilia/cover.png ADDED
weights/hololive-en/Elizabeth/Elizabeth_Dacoolkid.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd31bb108cbf968b6a98098c3037e733dc36b5ef532055d89b8d3ce3b427b351
3
+ size 55232074
weights/hololive-en/Elizabeth/added_IVF1418_Flat_nprobe_1_Elizabeth_Rose_Bloodflame_v2.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c310911110fa2e875c3ea7e065a4173b276f0694a52e4f508fd6f1d27380637f
3
+ size 174793219
weights/hololive-en/Elizabeth/added_IVF1418_Flat_nprobe_1_Elizabeth_Rose_Bloodflame_v2_mbkm.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27826f42edf3b7ef4f0b4345fb4bbf59a4c414bed4e3703e81ec53353cd4af96
3
+ size 31588619
weights/hololive-en/Elizabeth/cover.png ADDED
weights/hololive-en/Gigi/Gigi_HinaBl.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6043992dc8e12ef42ff44955f44cc45bc7c328e9c19012e5836e76d095591e8
3
+ size 56255925
weights/hololive-en/Gigi/added_IVF1648_Flat_nprobe_1_gigi-murin_v2_mbkm.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74e8b3ecf60ac95168e5a1c3efc579f62d4503e6ca805f4c64d408408eb8a25e
3
+ size 31588619
weights/hololive-en/Gigi/cover.png ADDED
weights/hololive-en/Raora/Raora_00a.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3908f2f5946b33a4c980a2fca210613209e72446afc9287a24d4818e49d5730
3
+ size 55222703
weights/hololive-en/Raora/added_IVF2050_Flat_nprobe_1_raora_v2_mbkm.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee4410f5264cac705c4a9722cb568c8d09de1ad1990795e191432a9d05c83cad
3
+ size 31588619
weights/hololive-en/Raora/cover.png ADDED
weights/hololive-en/model_info.json CHANGED
@@ -1,130 +1,162 @@
1
- {
2
- "Kiara": {
3
- "enable": true,
4
- "model_path": "Kiara_Dacoolkid.pth",
5
- "title": "Takanashi Kiara",
6
- "cover": "cover.png",
7
- "feature_retrieval_library": "added_IVF4961_Flat_nprobe_1_mbkm.index",
8
- "author": "dacoolkid44 & Hijack"
9
- },
10
- "Calliope": {
11
- "enable": true,
12
- "model_path": "Calli_RigidSpinner.pth",
13
- "title": "Mori Calliope",
14
- "cover": "cover.png",
15
- "feature_retrieval_library": "added_IVF4173_Flat_nprobe_1_CalliopeMori_v2_mbkm.index",
16
- "author": "RigidSpinner"
17
- },
18
- "Ina": {
19
- "enable": true,
20
- "model_path": "Ina_Dacoolkid.pth",
21
- "title": "Ninomae Ina'nis",
22
- "cover": "cover.png",
23
- "feature_retrieval_library": "added_IVF1754_Flat_nprobe_1_Inatalk2_v2_mbkm.index",
24
- "author": "dacoolkid44 & Hijack"
25
- },
26
- "Gura": {
27
- "enable": true,
28
- "model_path": "Gura_Mustar.pth",
29
- "title": "Gawr Gura",
30
- "cover": "cover.png",
31
- "feature_retrieval_library": "added_IVF256_Flat_nprobe_1_Gura_v2.index",
32
- "author": "MUSTAR"
33
- },
34
- "Amelia": {
35
- "enable": true,
36
- "model_path": "Amelia_Dacoolkid.pth",
37
- "title": "Amelia Watson",
38
- "cover": "cover.png",
39
- "feature_retrieval_library": "added_IVF4964_Flat_nprobe_1_Amelia_v2_mbkm.index",
40
- "author": "dacoolkid44 & Hijack"
41
- },
42
- "IRyS": {
43
- "enable": true,
44
- "model_path": "Irys_Mimizukari.pth",
45
- "title": "IRyS",
46
- "cover": "cover.png",
47
- "feature_retrieval_library": "added_IVF3197_Flat_nprobe_1_Irys_v2_mbkm.index",
48
- "author": "Mimizukari"
49
- },
50
- "Sana": {
51
- "enable": true,
52
- "model_path": "Sana_KitLemonfoot.pth",
53
- "title": "Tsukumo Sana",
54
- "cover": "cover.png",
55
- "feature_retrieval_library": "added_IVF3032_Flat_nprobe_1_v2_mbkm.index",
56
- "author": "Kit Lemonfoot / NSHFB"
57
- },
58
- "Fauna": {
59
- "enable": true,
60
- "model_path": "Fauna_TataSoto.pth",
61
- "title": "Ceres Fauna",
62
- "cover": "cover.png",
63
- "feature_retrieval_library": "fauna_mbkm.index",
64
- "author": "TataSoto"
65
- },
66
- "Kronii": {
67
- "enable": true,
68
- "model_path": "Kronii_Dacoolkid.pth",
69
- "title": "Ouro Kronii",
70
- "cover": "cover.png",
71
- "feature_retrieval_library": "added_IVF1728_Flat_nprobe_1_kronii_v2_mbkm.index",
72
- "author": "dacoolkid44 & Hijack"
73
- },
74
- "Mumei": {
75
- "enable": true,
76
- "model_path": "Mumei_Dacoolkid.pth",
77
- "title": "Nanashi Mumei",
78
- "cover": "cover.png",
79
- "feature_retrieval_library": "added_IVF1020_Flat_nprobe_1_MUMEI_v2_mbkm.index",
80
- "author": "dacoolkid44 & Hijack"
81
- },
82
- "Baelz": {
83
- "enable": true,
84
- "model_path": "Baelz_Dacoolkid.pth",
85
- "title": "Hakos Baelz",
86
- "cover": "cover.png",
87
- "feature_retrieval_library": "added_IVF2170_Flat_nprobe_1_baetalk_v2_mbkm.index",
88
- "author": "dacoolkid44 & Hijack"
89
- },
90
- "Shiori": {
91
- "enable": true,
92
- "model_path": "Shiori_MahdeenSky.pth",
93
- "title": "Shiori Novella",
94
- "cover": "cover.png",
95
- "feature_retrieval_library": "added_IVF647_Flat_nprobe_1_ShioriNovella_v2.index",
96
- "author": "MahdeenSky"
97
- },
98
- "Bijou": {
99
- "enable": true,
100
- "model_path": "Bijou_RigidSpinner.pth",
101
- "title": "Koseki Bijou",
102
- "cover": "cover.png",
103
- "feature_retrieval_library": "added_IVF4915_Flat_nprobe_1_KosekiBijou_v2_mbkm.index",
104
- "author": "RigidSpinner"
105
- },
106
- "Nerissa": {
107
- "enable": true,
108
- "model_path": "Nerissa_RigidSpinner.pth",
109
- "title": "Nerissa Ravencroft",
110
- "cover": "cover.png",
111
- "feature_retrieval_library": "added_IVF4968_Flat_nprobe_1_NerissaRavencroft_v2_mbkm.index",
112
- "author": "RigidSpinner"
113
- },
114
- "Fuwawa": {
115
- "enable": true,
116
- "model_path": "Fuwawa_Listra92.pth",
117
- "title": "Fuwawa Abyssgard",
118
- "cover": "cover.png",
119
- "feature_retrieval_library": "added_IVF2671_Flat_nprobe_1_fuwawa_v2_mbkm.index",
120
- "author": "Listra92"
121
- },
122
- "Mococo": {
123
- "enable": true,
124
- "model_path": "Mococo_LeoFrixi.pth",
125
- "title": "Mococo Abyssgard",
126
- "cover": "cover.png",
127
- "feature_retrieval_library": "added_IVF950_Flat_nprobe_1_Mococo_AbyssgardV2_v2_mbkm.index",
128
- "author": "Leo_Frixi"
129
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  }
 
1
+ {
2
+ "Kiara": {
3
+ "enable": true,
4
+ "model_path": "Kiara_Dacoolkid.pth",
5
+ "title": "Takanashi Kiara",
6
+ "cover": "cover.png",
7
+ "feature_retrieval_library": "added_IVF4961_Flat_nprobe_1_mbkm.index",
8
+ "author": "dacoolkid44 & Hijack"
9
+ },
10
+ "Calliope": {
11
+ "enable": true,
12
+ "model_path": "Calli_RigidSpinner.pth",
13
+ "title": "Mori Calliope",
14
+ "cover": "cover.png",
15
+ "feature_retrieval_library": "added_IVF4173_Flat_nprobe_1_CalliopeMori_v2_mbkm.index",
16
+ "author": "RigidSpinner"
17
+ },
18
+ "Ina": {
19
+ "enable": true,
20
+ "model_path": "Ina_Dacoolkid.pth",
21
+ "title": "Ninomae Ina'nis",
22
+ "cover": "cover.png",
23
+ "feature_retrieval_library": "added_IVF1754_Flat_nprobe_1_Inatalk2_v2_mbkm.index",
24
+ "author": "dacoolkid44 & Hijack"
25
+ },
26
+ "Gura": {
27
+ "enable": true,
28
+ "model_path": "Gura_Mustar.pth",
29
+ "title": "Gawr Gura",
30
+ "cover": "cover.png",
31
+ "feature_retrieval_library": "added_IVF256_Flat_nprobe_1_Gura_v2.index",
32
+ "author": "MUSTAR"
33
+ },
34
+ "Amelia": {
35
+ "enable": true,
36
+ "model_path": "Amelia_Dacoolkid.pth",
37
+ "title": "Amelia Watson",
38
+ "cover": "cover.png",
39
+ "feature_retrieval_library": "added_IVF4964_Flat_nprobe_1_Amelia_v2_mbkm.index",
40
+ "author": "dacoolkid44 & Hijack"
41
+ },
42
+ "IRyS": {
43
+ "enable": true,
44
+ "model_path": "Irys_Mimizukari.pth",
45
+ "title": "IRyS",
46
+ "cover": "cover.png",
47
+ "feature_retrieval_library": "added_IVF3197_Flat_nprobe_1_Irys_v2_mbkm.index",
48
+ "author": "Mimizukari"
49
+ },
50
+ "Sana": {
51
+ "enable": true,
52
+ "model_path": "Sana_KitLemonfoot.pth",
53
+ "title": "Tsukumo Sana",
54
+ "cover": "cover.png",
55
+ "feature_retrieval_library": "added_IVF3032_Flat_nprobe_1_v2_mbkm.index",
56
+ "author": "Kit Lemonfoot / NSHFB"
57
+ },
58
+ "Fauna": {
59
+ "enable": true,
60
+ "model_path": "Fauna_TataSoto.pth",
61
+ "title": "Ceres Fauna",
62
+ "cover": "cover.png",
63
+ "feature_retrieval_library": "fauna_mbkm.index",
64
+ "author": "TataSoto"
65
+ },
66
+ "Kronii": {
67
+ "enable": true,
68
+ "model_path": "Kronii_Dacoolkid.pth",
69
+ "title": "Ouro Kronii",
70
+ "cover": "cover.png",
71
+ "feature_retrieval_library": "added_IVF1728_Flat_nprobe_1_kronii_v2_mbkm.index",
72
+ "author": "dacoolkid44 & Hijack"
73
+ },
74
+ "Mumei": {
75
+ "enable": true,
76
+ "model_path": "Mumei_Dacoolkid.pth",
77
+ "title": "Nanashi Mumei",
78
+ "cover": "cover.png",
79
+ "feature_retrieval_library": "added_IVF1020_Flat_nprobe_1_MUMEI_v2_mbkm.index",
80
+ "author": "dacoolkid44 & Hijack"
81
+ },
82
+ "Baelz": {
83
+ "enable": true,
84
+ "model_path": "Baelz_Dacoolkid.pth",
85
+ "title": "Hakos Baelz",
86
+ "cover": "cover.png",
87
+ "feature_retrieval_library": "added_IVF2170_Flat_nprobe_1_baetalk_v2_mbkm.index",
88
+ "author": "dacoolkid44 & Hijack"
89
+ },
90
+ "Shiori": {
91
+ "enable": true,
92
+ "model_path": "Shiori_MahdeenSky.pth",
93
+ "title": "Shiori Novella",
94
+ "cover": "cover.png",
95
+ "feature_retrieval_library": "added_IVF647_Flat_nprobe_1_ShioriNovella_v2.index",
96
+ "author": "MahdeenSky"
97
+ },
98
+ "Bijou": {
99
+ "enable": true,
100
+ "model_path": "Bijou_RigidSpinner.pth",
101
+ "title": "Koseki Bijou",
102
+ "cover": "cover.png",
103
+ "feature_retrieval_library": "added_IVF4915_Flat_nprobe_1_KosekiBijou_v2_mbkm.index",
104
+ "author": "RigidSpinner"
105
+ },
106
+ "Nerissa": {
107
+ "enable": true,
108
+ "model_path": "Nerissa_RigidSpinner.pth",
109
+ "title": "Nerissa Ravencroft",
110
+ "cover": "cover.png",
111
+ "feature_retrieval_library": "added_IVF4968_Flat_nprobe_1_NerissaRavencroft_v2_mbkm.index",
112
+ "author": "RigidSpinner"
113
+ },
114
+ "Fuwawa": {
115
+ "enable": true,
116
+ "model_path": "Fuwawa_Listra92.pth",
117
+ "title": "Fuwawa Abyssgard",
118
+ "cover": "cover.png",
119
+ "feature_retrieval_library": "added_IVF2671_Flat_nprobe_1_fuwawa_v2_mbkm.index",
120
+ "author": "Listra92"
121
+ },
122
+ "Mococo": {
123
+ "enable": true,
124
+ "model_path": "Mococo_LeoFrixi.pth",
125
+ "title": "Mococo Abyssgard",
126
+ "cover": "cover.png",
127
+ "feature_retrieval_library": "added_IVF950_Flat_nprobe_1_Mococo_AbyssgardV2_v2_mbkm.index",
128
+ "author": "Leo_Frixi"
129
+ },
130
+ "Elizabeth": {
131
+ "enable": true,
132
+ "model_path": "Elizabeth_Dacoolkid.pth",
133
+ "title": "Elizabeth Rose Bloodflame",
134
+ "cover": "cover.png",
135
+ "feature_retrieval_library": "added_IVF1418_Flat_nprobe_1_Elizabeth_Rose_Bloodflame_v2_mbkm.index",
136
+ "author": "dacoolkid44"
137
+ },
138
+ "Gigi": {
139
+ "enable": true,
140
+ "model_path": "Gigi_HinaBl.pth",
141
+ "title": "Gigi Murin",
142
+ "cover": "cover.png",
143
+ "feature_retrieval_library": "added_IVF1648_Flat_nprobe_1_gigi-murin_v2_mbkm.index",
144
+ "author": "HinaBl"
145
+ },
146
+ "Cecilia": {
147
+ "enable": true,
148
+ "model_path": "Cecilia_KitLemonfoot.pth",
149
+ "title": "Cecilia Immergreen",
150
+ "cover": "cover.png",
151
+ "feature_retrieval_library": "added_IVF1477_Flat_nprobe_1_CeciliaImmergreen_Singing_KitLemonfoot_v2_mbkm.index",
152
+ "author": "Kit Lemonfoot / NSHFB"
153
+ },
154
+ "Raora": {
155
+ "enable": true,
156
+ "model_path": "Raora_00a.pth",
157
+ "title": "Raora Panthera",
158
+ "cover": "cover.png",
159
+ "feature_retrieval_library": "added_IVF2050_Flat_nprobe_1_raora_v2_mbkm.index",
160
+ "author": "00a"
161
+ }
162
  }
weights/phaseconnect/Pico/Pico_Sxndypz.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c7d9128cffb2663c4f7cf4df188eef969a66eab87a9bafedd6c9bd424976876
3
+ size 56264557
weights/phaseconnect/Pico/added_IVF1132_Flat_nprobe_1_grampico_v2_mbkm.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6750b29caf83d81bfb85eeb1c3d720002b7acad4fdfdf7a18afeeaac899cc2ff
3
+ size 31588619
weights/phaseconnect/Pico/cover.png ADDED
weights/phaseconnect/model_info.json CHANGED
@@ -151,6 +151,14 @@
151
  "feature_retrieval_library": "added_IVF725_Flat_nprobe_1_eimi_v2.index",
152
  "author": "Sxndypz"
153
  },
 
 
 
 
 
 
 
 
154
  "Memory": {
155
  "enable": true,
156
  "model_path": "Memory_Sxndypz.pth",
@@ -159,12 +167,12 @@
159
  "feature_retrieval_library": "added_IVF930_Flat_nprobe_1_memory_v2_mbkm.index",
160
  "author": "Sxndypz"
161
  },
162
- "Clara": {
163
  "enable": true,
164
- "model_path": "Clara_Sxndypz.pth",
165
- "title": "Kaminari Clara",
166
  "cover": "cover.png",
167
- "feature_retrieval_library": "added_IVF1080_Flat_nprobe_1_clara_v2_mbkm.index",
168
  "author": "Sxndypz"
169
  }
170
  }
 
151
  "feature_retrieval_library": "added_IVF725_Flat_nprobe_1_eimi_v2.index",
152
  "author": "Sxndypz"
153
  },
154
+ "Clara": {
155
+ "enable": true,
156
+ "model_path": "Clara_Sxndypz.pth",
157
+ "title": "Kaminari Clara",
158
+ "cover": "cover.png",
159
+ "feature_retrieval_library": "added_IVF1080_Flat_nprobe_1_clara_v2_mbkm.index",
160
+ "author": "Sxndypz"
161
+ },
162
  "Memory": {
163
  "enable": true,
164
  "model_path": "Memory_Sxndypz.pth",
 
167
  "feature_retrieval_library": "added_IVF930_Flat_nprobe_1_memory_v2_mbkm.index",
168
  "author": "Sxndypz"
169
  },
170
+ "Pico": {
171
  "enable": true,
172
+ "model_path": "Pico_Sxndypz.pth",
173
+ "title": "Gram Pico",
174
  "cover": "cover.png",
175
+ "feature_retrieval_library": "added_IVF1132_Flat_nprobe_1_grampico_v2_mbkm.index",
176
  "author": "Sxndypz"
177
  }
178
  }