Pj12 committed on commit c6114a2 · verified · 1 parent: fbe9aca

Delete models Inference.py

Files changed (1)
  1. models Inference.py +0 -1381

models Inference.py DELETED
@@ -1,1381 +0,0 @@
import math, pdb, os
from time import time as ttime

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm

from infer_pack import modules
from infer_pack import attentions
from infer_pack import commons
from infer_pack.commons import init_weights, get_padding


class TextEncoder256(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask


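Example (not part of the deleted file): a minimal sketch of how this text encoder is typically driven, assuming 256-dimensional content features and coarse pitch indices in [0, 255]; the hyperparameter values and shapes are illustrative only.

# Hypothetical usage sketch for TextEncoder256 (illustrative, not from the deleted file).
import torch

enc = TextEncoder256(
    out_channels=192,
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.0,
)
phone = torch.randn(1, 100, 256)              # [batch, frames, 256] content features
pitch = torch.randint(0, 256, (1, 100))       # coarse pitch bins, one per frame
lengths = torch.tensor([100])                 # valid frames per batch item
m, logs, x_mask = enc(phone, pitch, lengths)  # prior mean, log-std and frame mask
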
class TextEncoder768(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask


class TextEncoder1024(nn.Module):
    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(1024, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask


class ResidualCouplingBlock(nn.Module):
    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()


class PosteriorEncoder(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()


class Generator(torch.nn.Module):
    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0, upp):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
        f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            rad_values = (
                f0_buf / self.sampling_rate
            ) % 1  # the % 1 means the per-harmonic products cannot be post-processed away
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            tmp_over_one = torch.cumsum(
                rad_values, 1
            )  # % 1  # the % 1 means the cumsum below cannot be optimized further
            tmp_over_one *= upp
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            tmp_over_one %= 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
            return sine_waves, uv, noise


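Example (not part of the deleted file): a minimal sketch of driving SineGen with a frame-level F0 contour; the sampling rate, frame count, and upsampling factor upp are illustrative assumptions.

# Hypothetical usage sketch for SineGen (illustrative values, not from the deleted file).
import torch

sine_gen = SineGen(samp_rate=40000, harmonic_num=0)
f0 = torch.full((1, 100), 220.0)   # [batch, frames] frame-level F0 in Hz (0 = unvoiced)
upp = 400                          # samples per frame, i.e. prod(upsample_rates)
sine_waves, uv, noise = sine_gen(f0, upp)
print(sine_waves.shape)            # [1, 100 * 400, harmonic_num + 1]
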
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that the amplitude of noise in unvoiced frames is decided
        by sine_amp
    voiced_threshod: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # to produce sine waveforms
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        if self.is_half:
            sine_wavs = sine_wavs.half()
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None  # noise, uv


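Example (not part of the deleted file): a minimal stand-alone sketch of the source module, assuming fp32 (is_half=False) so the .half() cast is skipped on CPU; the values are illustrative.

# Hypothetical stand-alone check of SourceModuleHnNSF (illustrative, not from the deleted file).
import torch

source = SourceModuleHnNSF(sampling_rate=40000, harmonic_num=0, is_half=False)
f0 = torch.full((1, 100), 220.0)        # frame-level F0 in Hz
har_source, _, _ = source(f0, upp=400)  # merged sine excitation at sample rate
print(har_source.shape)                 # [1, 100 * 400, 1]
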
class GeneratorNSF(torch.nn.Module):
    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


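Example (not part of the deleted file): a small sketch of the stride bookkeeping above, assuming the rates [10, 10, 2, 2]; upp is the total frames-to-samples factor, and each noise_convs entry downsamples the excitation back to the resolution of the current upsampling stage.

# Hypothetical illustration of GeneratorNSF stride bookkeeping (not from the deleted file).
import numpy as np

upsample_rates = [10, 10, 2, 2]   # illustrative configuration
upp = np.prod(upsample_rates)     # 400 output samples per input frame
for i in range(len(upsample_rates)):
    stride_f0 = np.prod(upsample_rates[i + 1:]) if i + 1 < len(upsample_rates) else 1
    print(i, stride_f0)           # stride of noise_convs[i]: 40, 4, 2, 1
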
sr2sr = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}


class SynthesizerTrnMs256NSFsid(nn.Module):
    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, [bs, 1]
        # print(1, pitch.shape)  # [bs, t]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # print(-1, pitchf.shape, ids_slice, self.segment_size, self.hop_length, self.segment_size // self.hop_length)
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        # print(-2, pitchf.shape, z_slice.shape)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)


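Example (not part of the deleted file): a minimal sketch of running SynthesizerTrnMs256NSFsid.infer with random tensors. The hyperparameters here are illustrative assumptions for a 40k-style configuration; in practice they must match the values stored in the checkpoint's config.

# Hypothetical inference sketch (illustrative configuration, not from the deleted file).
import torch

net_g = SynthesizerTrnMs256NSFsid(
    1025, 32, 192, 192, 768, 2, 6, 3, 0,               # spec/segment/channel/attention settings
    "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    [10, 10, 2, 2], 512, [16, 16, 4, 4],                # upsampling by 10*10*2*2 = 400
    spk_embed_dim=109, gin_channels=256, sr="40k",
    is_half=False,
)
net_g.eval()

frames = 100
phone = torch.randn(1, frames, 256)          # content features
pitch = torch.randint(0, 256, (1, frames))   # coarse pitch bins for the text encoder
nsff0 = torch.full((1, frames), 220.0)       # frame-level F0 in Hz for the NSF source
phone_lengths = torch.tensor([frames])
sid = torch.tensor([0])                      # speaker id

with torch.no_grad():
    audio, x_mask, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)
print(audio.shape)                           # [1, 1, frames * 400]
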
689
- class SynthesizerTrnMs768NSFsid(nn.Module):
690
- def __init__(
691
- self,
692
- spec_channels,
693
- segment_size,
694
- inter_channels,
695
- hidden_channels,
696
- filter_channels,
697
- n_heads,
698
- n_layers,
699
- kernel_size,
700
- p_dropout,
701
- resblock,
702
- resblock_kernel_sizes,
703
- resblock_dilation_sizes,
704
- upsample_rates,
705
- upsample_initial_channel,
706
- upsample_kernel_sizes,
707
- spk_embed_dim,
708
- gin_channels,
709
- sr,
710
- **kwargs
711
- ):
712
- super().__init__()
713
- if type(sr) == type("strr"):
714
- sr = sr2sr[sr]
715
- self.spec_channels = spec_channels
716
- self.inter_channels = inter_channels
717
- self.hidden_channels = hidden_channels
718
- self.filter_channels = filter_channels
719
- self.n_heads = n_heads
720
- self.n_layers = n_layers
721
- self.kernel_size = kernel_size
722
- self.p_dropout = p_dropout
723
- self.resblock = resblock
724
- self.resblock_kernel_sizes = resblock_kernel_sizes
725
- self.resblock_dilation_sizes = resblock_dilation_sizes
726
- self.upsample_rates = upsample_rates
727
- self.upsample_initial_channel = upsample_initial_channel
728
- self.upsample_kernel_sizes = upsample_kernel_sizes
729
- self.segment_size = segment_size
730
- self.gin_channels = gin_channels
731
- # self.hop_length = hop_length#
732
- self.spk_embed_dim = spk_embed_dim
733
- self.enc_p = TextEncoder768(
734
- inter_channels,
735
- hidden_channels,
736
- filter_channels,
737
- n_heads,
738
- n_layers,
739
- kernel_size,
740
- p_dropout,
741
- )
742
- self.dec = GeneratorNSF(
743
- inter_channels,
744
- resblock,
745
- resblock_kernel_sizes,
746
- resblock_dilation_sizes,
747
- upsample_rates,
748
- upsample_initial_channel,
749
- upsample_kernel_sizes,
750
- gin_channels=gin_channels,
751
- sr=sr,
752
- is_half=kwargs["is_half"],
753
- )
754
- self.enc_q = PosteriorEncoder(
755
- spec_channels,
756
- inter_channels,
757
- hidden_channels,
758
- 5,
759
- 1,
760
- 16,
761
- gin_channels=gin_channels,
762
- )
763
- self.flow = ResidualCouplingBlock(
764
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
765
- )
766
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
767
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
768
-
769
- def remove_weight_norm(self):
770
- self.dec.remove_weight_norm()
771
- self.flow.remove_weight_norm()
772
- self.enc_q.remove_weight_norm()
773
-
774
- def forward(
775
- self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
776
- ): # 这里ds是id,[bs,1]
777
- # print(1,pitch.shape)#[bs,t]
778
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
779
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
780
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
781
- z_p = self.flow(z, y_mask, g=g)
782
- z_slice, ids_slice = commons.rand_slice_segments(
783
- z, y_lengths, self.segment_size
784
- )
785
- # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
786
- pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
787
- # print(-2,pitchf.shape,z_slice.shape)
788
- o = self.dec(z_slice, pitchf, g=g)
789
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
790
-
791
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
792
- g = self.emb_g(sid).unsqueeze(-1)
793
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
794
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
795
- z = self.flow(z_p, x_mask, g=g, reverse=True)
796
- o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
797
- return o, x_mask, (z, z_p, m_p, logs_p)
798
-
class SynthesizerTrnMs1024NSFsid(nn.Module):
    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder1024(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, [bs, 1]
        # print(1, pitch.shape)  # [bs, t]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # print(-1, pitchf.shape, ids_slice, self.segment_size, self.hop_length, self.segment_size // self.hop_length)
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        # print(-2, pitchf.shape, z_slice.shape)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)


class SynthesizerTrnMs256NSFsid_nono(nn.Module):
    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder256(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)


class SynthesizerTrnMs768NSFsid_nono(nn.Module):
    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder768(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)


class SynthesizerTrnMs1024NSFsid_nono(nn.Module):
    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        self.enc_p = TextEncoder1024(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11, 17]
        # periods = [3, 5, 7, 11, 17, 23, 37]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [
            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
        ]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            # for j in range(len(fmap_r)):
            #     print(i, j, y.shape, y_hat.shape, fmap_r[j].shape, fmap_g[j].shape)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class MultiPeriodDiscriminatorV2(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminatorV2, self).__init__()
        # periods = [2, 3, 5, 7, 11, 17]
        periods = [2, 3, 5, 7, 11, 17, 23, 37]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [
            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
        ]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            # for j in range(len(fmap_r)):
            #     print(i, j, y.shape, y_hat.shape, fmap_r[j].shape, fmap_g[j].shape)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(
                    Conv2d(
                        1,
                        32,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        32,
                        128,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        128,
                        512,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        512,
                        1024,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
                norm_f(
                    Conv2d(
                        1024,
                        1024,
                        (kernel_size, 1),
                        1,
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                ),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
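
Example (not part of the deleted file): a minimal sketch of scoring real and generated waveform slices with the discriminator stack defined above; the one-second slice at 40 kHz is an illustrative assumption.

# Hypothetical usage sketch for the discriminator stack (illustrative, not from the deleted file).
import torch

mpd = MultiPeriodDiscriminator()
y = torch.randn(1, 1, 40000)      # real waveform slice [batch, 1, samples]
y_hat = torch.randn(1, 1, 40000)  # generated waveform slice
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_hat)
print(len(y_d_rs))                # 7: DiscriminatorS plus one DiscriminatorP per period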