maxmax20160403 committed on
Commit
c24b656
1 Parent(s): c2e61ce
app.py CHANGED
@@ -60,7 +60,7 @@ def compute_f0_nn(filename, device):
     periodicity = np.repeat(periodicity, 2, -1)  # 320 -> 160 * 2
     # CREPE was not trained on silent audio. some error on silent need filter.
     periodicity = torchcrepe.filter.median(periodicity, 9)
-    pitch = torchcrepe.filter.mean(pitch, 9)
+    pitch = torchcrepe.filter.mean(pitch, 3)
     pitch[periodicity < 0.1] = 0
     pitch = pitch.squeeze(0)
     return pitch
@@ -72,7 +72,7 @@ model = SynthesizerInfer(
     hp.data.filter_length // 2 + 1,
     hp.data.segment_size // hp.data.hop_length,
     hp)
-load_svc_model("vits_pretrain/sovits5.0-48k-debug.pth", model)
+load_svc_model("vits_pretrain/sovits5.0_bigvgan.pth", model)
 model.eval()
 model.to(device)
@@ -116,17 +116,17 @@ def svc_change(argswave, argsspk):
         has_audio = True
         if (out_index == 0):  # start frame
             cut_s = out_index
-            cut_s_48k = 0
+            cut_s_out = 0
         else:
             cut_s = out_index - hop_frame
-            cut_s_48k = hop_frame * hop_size
+            cut_s_out = hop_frame * hop_size

         if (out_index + out_chunk + hop_frame > all_frame):  # end frame
             cut_e = out_index + out_chunk
-            cut_e_48k = 0
+            cut_e_out = 0
         else:
             cut_e = out_index + out_chunk + hop_frame
-            cut_e_48k = -1 * hop_frame * hop_size
+            cut_e_out = -1 * hop_frame * hop_size

         sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
@@ -136,17 +136,17 @@ def svc_change(argswave, argsspk):
         sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

-        sub_out = sub_out[cut_s_48k:cut_e_48k]
+        sub_out = sub_out[cut_s_out:cut_e_out]
         out_audio.extend(sub_out)
         out_index = out_index + out_chunk

     if (out_index < all_frame):
         if (has_audio):
             cut_s = out_index - hop_frame
-            cut_s_48k = hop_frame * hop_size
+            cut_s_out = hop_frame * hop_size
         else:
             cut_s = 0
-            cut_s_48k = 0
+            cut_s_out = 0
         sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:].unsqueeze(0).to(device)
         sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
@@ -154,7 +154,7 @@ def svc_change(argswave, argsspk):
         sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

-        sub_out = sub_out[cut_s_48k:]
+        sub_out = sub_out[cut_s_out:]
         out_audio.extend(sub_out)
         out_audio = np.asarray(out_audio)
@@ -175,7 +175,7 @@ def svc_main(sid, input_audio):
     wav_path = "temp.wav"
     soundfile.write(wav_path, audio, 16000, format="wav")
     out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-    return "Success", (48000, out_audio)
+    return "Success", (32000, out_audio)
@@ -183,7 +183,7 @@ with app:
     with gr.Tabs():
         with gr.TabItem("sovits 5.0"):
             gr.Markdown(value="""
-                Based on the open-source dataset: Multi-Singer
+                Final version, based on the open-source dataset: Multi-Singer

                 https://github.com/Multi-Singer/Multi-Singer.github.io
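Note on the chunked inference above: the renamed cut_s_out/cut_e_out indices implement overlap trimming, where each chunk is synthesized with hop_frame frames of warm-up context on each side and the matching hop_frame * hop_size samples are cut from the waveform before the chunks are joined. A minimal sketch of that arithmetic for an interior chunk, assuming illustrative values hop_frame=8, hop_size=320 and out_chunk=100 (the real values come from the model config):

import numpy as np

hop_frame, hop_size, out_chunk = 8, 320, 100     # illustrative values only
pad = hop_frame * hop_size                       # warm-up samples per side

# stand-in for model.inference output on an interior chunk
sub_out = np.zeros((out_chunk + 2 * hop_frame) * hop_size)
trimmed = sub_out[pad:-pad]                      # the cut_s_out / cut_e_out trim
assert trimmed.size == out_chunk * hop_size      # exactly out_chunk frames remain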
configs/base.yaml CHANGED
@@ -7,24 +7,24 @@ train:
   lr_decay: 0.999875
   eps: 1e-9
   batch_size: 8
-  c_stft: 5
-  c_mel: 2.5
-  c_kl: 1.0
+  c_stft: 9
+  c_mel: 1.
+  c_kl: 0.2
   port: 8001
   pretrain: ""
 #############################
 data:
   training_files: "files/train.txt"
   validation_files: "files/valid.txt"
-  segment_size: 12000 # WARNING: base on hop_length
+  segment_size: 8000 # WARNING: base on hop_length
   max_wav_value: 32768.0
-  sampling_rate: 48000
-  filter_length: 2048
-  hop_length: 480
-  win_length: 2048
-  mel_channels: 80
-  mel_fmin: 0.0
-  mel_fmax: 24000.0
+  sampling_rate: 32000
+  filter_length: 1024
+  hop_length: 320
+  win_length: 1024
+  mel_channels: 100
+  mel_fmin: 50.0
+  mel_fmax: 16000.0
 #############################
 vits:
   ppg_dim: 1024
@@ -36,9 +36,9 @@ vits:
 #############################
 gen:
   upsample_input: 192
-  upsample_rates: [6,5,4,2,2]
-  upsample_kernel_sizes: [20,15,8,4,4]
-  upsample_initial_channel: 256
+  upsample_rates: [5,4,4,2,2]
+  upsample_kernel_sizes: [15,8,8,4,4]
+  upsample_initial_channel: 320
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
 #############################
@@ -50,13 +50,13 @@ mpd:
   lReLU_slope: 0.2
 #############################
 mrd:
-  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (512, 50, 240)]" # (filter_length, hop_length, win_length)
+  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length)
   use_spectral_norm: False
   lReLU_slope: 0.2
 #############################
 log:
   info_interval: 100
-  eval_interval: 5
+  eval_interval: 1
   save_interval: 5
   num_audio: 6
   pth_dir: 'chkpt'
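The data and gen sections have to stay consistent: the generator turns one spectral frame into hop_length samples, so the product of upsample_rates must equal hop_length. A quick check of the new 32 kHz values (a sketch, not part of the repo):

import numpy as np

sampling_rate, hop_length = 32000, 320
upsample_rates = [5, 4, 4, 2, 2]

assert np.prod(upsample_rates) == hop_length   # 5*4*4*2*2 = 320
print(sampling_rate / hop_length)              # 100.0 frames per second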
vits/data_utils.py CHANGED
@@ -27,8 +27,8 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
     def _filter(self):
         lengths = []
         items_new = []
-        items_min = int(self.segment_size / self.hop_length * 2)  # 1 S
-        items_max = int(self.segment_size / self.hop_length * 9)  # 4.5 S
+        items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
+        items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
        for wavpath, spec, pitch, ppg, spk in self.items:
             if not os.path.isfile(wavpath):
                 continue
@@ -87,7 +87,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         spk = torch.FloatTensor(spk)

         len_pit = pit.size()[0]
-        len_ppg = ppg.size()[0]
+        len_ppg = ppg.size()[0] - 2  # for safe
         len_min = min(len_pit, len_ppg)
         len_wav = len_min * self.hop_length
@@ -255,6 +255,8 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
         for i in range(len(self.buckets)):
             bucket = self.buckets[i]
             len_bucket = len(bucket)
+            if (len_bucket == 0):
+                continue
             ids_bucket = indices[i]
             num_samples_bucket = self.num_samples_per_bucket[i]
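The new multipliers keep the length filter aligned with the 32 kHz config: segment_size / hop_length is now 8000 / 320 = 25 frames, so the bounds work out to 100 and 400 frames, i.e. 1 s and 4 s at 100 frames per second, matching the updated "# 1 S" and "# 4 S" comments. A quick arithmetic check:

segment_frames = 8000 // 320      # 25 frames per training segment
items_min = segment_frames * 4    # 100 frames -> 1 s
items_max = segment_frames * 16   # 400 frames -> 4 s
print(items_min, items_max)       # 100 400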
vits/models.py CHANGED
@@ -8,6 +8,7 @@ from vits import commons
 from vits import modules
 from vits.utils import f0_to_coarse
 from vits_decoder.generator import Generator
+from vits.modules_grl import SpeakerClassifier


 class TextEncoder(nn.Module):
@@ -44,7 +45,7 @@ class TextEncoder(nn.Module):
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
+        return z, m, logs, x_mask, x


 class ResidualCouplingBlock(nn.Module):
@@ -151,6 +152,10 @@ class SynthesizerTrn(nn.Module):
             3,
             0.1,
         )
+        self.speaker_classifier = SpeakerClassifier(
+            hp.vits.hidden_channels,
+            hp.vits.spk_dim,
+        )
         self.enc_q = PosteriorEncoder(
             spec_channels,
             hp.vits.inter_channels,
@@ -171,8 +176,9 @@ class SynthesizerTrn(nn.Module):
         self.dec = Generator(hp=hp)

     def forward(self, ppg, pit, spec, spk, ppg_l, spec_l):
+        ppg = ppg + torch.randn_like(ppg)  # Perturbation
         g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)

@@ -183,10 +189,13 @@ class SynthesizerTrn(nn.Module):
         # SNAC to flow
         z_f, logdet_f = self.flow(z_q, spec_mask, g=spk)
         z_r, logdet_r = self.flow(z_p, spec_mask, g=spk, reverse=True)
-        return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r)
+        # speaker
+        spk_preds = self.speaker_classifier(x)
+        return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds

     def infer(self, ppg, pit, spk, ppg_l):
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec(spk, z * ppg_mask, f0=pit)
@@ -233,7 +242,7 @@ class SynthesizerInfer(nn.Module):
         return self.dec.source2wav(source)

     def inference(self, ppg, pit, spk, ppg_l, source):
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec.inference(spk, z * ppg_mask, source)
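SynthesizerTrn.forward now also returns spk_preds from the GRL-backed SpeakerClassifier. One plausible way a training loop could consume it (a hedged sketch; the actual loss wiring lives in the training script, which is not part of this commit): because the classifier sits behind a gradient reversal layer, minimizing a speaker-matching loss on spk_preds trains the classifier to recognize the speaker while pushing the text-encoder features toward speaker independence.

import torch
import torch.nn.functional as F

def speaker_adversarial_loss(spk_preds, spk):
    # spk_preds: (B, spk_dim) from SynthesizerTrn.forward
    # spk:       (B, spk_dim) target speaker embedding
    # cosine-similarity matching is an assumed loss choice, not from this commit
    target = torch.ones(spk_preds.size(0), device=spk_preds.device)
    return F.cosine_embedding_loss(spk_preds, spk, target)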
vits/modules.py CHANGED
@@ -1,16 +1,7 @@
-import copy
-import math
-import numpy as np
-import scipy
 import torch
 from torch import nn
 from torch.nn import functional as F
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm
-
 from vits import commons
-from vits.commons import init_weights, get_padding


 LRELU_SLOPE = 0.1
@@ -220,148 +211,6 @@ class WN(torch.nn.Module):
             torch.nn.utils.remove_weight_norm(l)


-class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock1, self).__init__()
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c2(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
-
-
-class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-            ]
-        )
-        self.convs.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs:
-            remove_weight_norm(l)
-
-
 class Log(nn.Module):
     def forward(self, x, x_mask, reverse=False, **kwargs):
         if not reverse:
vits/modules_grl.py ADDED
@@ -0,0 +1,62 @@
+# Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0
+# Unsupervised Domain Adaptation by Backpropagation
+
+import torch
+import torch.nn as nn
+
+from torch.autograd import Function
+from torch.nn.utils import weight_norm
+
+
+class GradientReversalFunction(Function):
+    @staticmethod
+    def forward(ctx, x, lambda_):
+        ctx.lambda_ = lambda_
+        return x.clone()
+
+    @staticmethod
+    def backward(ctx, grads):
+        lambda_ = ctx.lambda_
+        lambda_ = grads.new_tensor(lambda_)
+        dx = -lambda_ * grads
+        return dx, None
+
+
+class GradientReversal(torch.nn.Module):
+    ''' Gradient Reversal Layer
+        Y. Ganin, V. Lempitsky,
+        "Unsupervised Domain Adaptation by Backpropagation",
+        in ICML, 2015.
+        The forward pass is the identity function.
+        In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradients are reversed).
+    '''
+
+    def __init__(self, lambda_reversal=1):
+        super(GradientReversal, self).__init__()
+        self.lambda_ = lambda_reversal
+
+    def forward(self, x):
+        return GradientReversalFunction.apply(x, self.lambda_)
+
+
+class SpeakerClassifier(nn.Module):
+
+    def __init__(self, embed_dim, spk_dim):
+        super(SpeakerClassifier, self).__init__()
+        self.classifier = nn.Sequential(
+            GradientReversal(lambda_reversal=1),
+            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2))
+        )
+
+    def forward(self, x):
+        ''' Forward function of Speaker Classifier:
+            x = (B, embed_dim, len)
+        '''
+        # pass through classifier
+        outputs = self.classifier(x)           # (B, spk_dim, len)
+        outputs = torch.mean(outputs, dim=-1)  # (B, spk_dim)
+        return outputs
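A quick numeric check of the new gradient reversal layer: the forward pass is the identity, while the backward pass multiplies incoming gradients by -lambda.

import torch
from vits.modules_grl import GradientReversal

x = torch.ones(3, requires_grad=True)
y = GradientReversal(lambda_reversal=1)(x).sum()  # forward is identity: y == 3
y.backward()
print(x.grad)  # tensor([-1., -1., -1.]) -- upstream gradient of 1 flipped to -1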
vits/utils.py CHANGED
@@ -1,10 +1,6 @@
-import os
-import argparse
-import numpy as np
 import torch
-
+import numpy as np
 from scipy.io.wavfile import read
-from omegaconf import OmegaConf

 MATPLOTLIB_FLAG = False
@@ -35,18 +31,3 @@ def f0_to_coarse(f0):
     assert f0_coarse.max() <= 255 and f0_coarse.min(
     ) >= 1, (f0_coarse.max(), f0_coarse.min())
     return f0_coarse
-
-
-def get_hparams(init=True):
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/base.yaml",
-                        help='YAML file for configuration')
-    args = parser.parse_args()
-    hparams = OmegaConf.load(args.config)
-    model_dir = os.path.join("./logs", hparams.train.model)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    config_save_path = os.path.join(model_dir, "config.json")
-    os.system(f"cp {args.config} {config_save_path}")
-    hparams.model_dir = model_dir
-    return hparams
vits_decoder/__init__.py CHANGED
@@ -0,0 +1 @@
+from .alias.act import SnakeAlias
vits_decoder/alias/act.py CHANGED
@@ -1,7 +1,12 @@
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 # LICENSE is in incl_licenses directory.

+import torch
 import torch.nn as nn
+import torch.nn.functional as F
+
+from torch import sin, pow
+from torch.nn import Parameter
 from .resample import UpSample1d, DownSample1d


@@ -19,6 +24,102 @@ class Activation1d(nn.Module):
         self.upsample = UpSample1d(up_ratio, up_kernel_size)
         self.downsample = DownSample1d(down_ratio, down_kernel_size)

+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snakebeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta = x + 1/b * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(
+            0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+
+
+class Mish(nn.Module):
+    """
+    Mish activation function is proposed in "Mish: A Self
+    Regularized Non-Monotonic Neural Activation Function"
+    paper, https://arxiv.org/abs/1908.08681.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+
+class SnakeAlias(nn.Module):
+    def __init__(self,
+                 channels,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = SnakeBeta(channels, alpha_logscale=True)
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
     # x: [B,C,T]
     def forward(self, x):
         x = self.upsample(x)
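SnakeAlias wraps SnakeBeta (y = x + (1/beta) * sin^2(alpha * x), per channel) between a 2x upsample and a 2x downsample so the periodic nonlinearity does not alias; the time dimension is unchanged end to end. A small shape check (a sketch; it relies on resample.py from this package):

import torch
from vits_decoder.alias.act import SnakeAlias

act = SnakeAlias(channels=8)   # upsample 2x -> SnakeBeta -> downsample 2x
x = torch.randn(1, 8, 64)      # [B, C, T]
print(act(x).shape)            # torch.Size([1, 8, 64]) -- T is preserved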
vits_decoder/alias/activations.py DELETED
File without changes
vits_decoder/bigv.py CHANGED
@@ -1,14 +1,9 @@
 import torch
-import torch.nn.functional as F
 import torch.nn as nn

-from torch import nn, sin, pow
-from torch.nn import Parameter
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
-
-
-from .alias import *
+from .alias.act import SnakeAlias


 def init_weights(m, mean=0.0, std=0.01):
@@ -21,69 +16,9 @@ def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)


-class SnakeBeta(nn.Module):
-    '''
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snakebeta(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    '''
-
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha - trainable parameter that controls frequency
-            - beta - trainable parameter that controls magnitude
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            beta is initialized to 1 by default, higher values = higher-magnitude.
-            alpha will be trained along with the rest of your model.
-        '''
-        super(SnakeBeta, self).__init__()
-        self.in_features = in_features
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = Parameter(torch.zeros(in_features) * alpha)
-            self.beta = Parameter(torch.zeros(in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = Parameter(torch.ones(in_features) * alpha)
-            self.beta = Parameter(torch.ones(in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self, x):
-        '''
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        SnakeBeta := x + 1/b * sin^2 (xa)
-        '''
-        alpha = self.alpha.unsqueeze(
-            0).unsqueeze(-1)  # line up with x to [B, C, T]
-        beta = self.beta.unsqueeze(0).unsqueeze(-1)
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)
-            beta = torch.exp(beta)
-        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-        return x
-
-
 class AMPBlock(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
         super(AMPBlock, self).__init__()
-        self.h = h
         self.convs1 = nn.ModuleList([
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                                padding=get_padding(kernel_size, dilation[0]))),
@@ -109,9 +44,7 @@ class AMPBlock(torch.nn.Module):

         # periodic nonlinearity with snakebeta function and anti-aliasing
         self.activations = nn.ModuleList([
-            Activation1d(
-                activation=SnakeBeta(channels, alpha_logscale=True))
-            for _ in range(self.num_layers)
+            SnakeAlias(channels) for _ in range(self.num_layers)
         ])

     def forward(self, x):
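With SnakeBeta moved into vits_decoder/alias/act.py, AMPBlock no longer needs the hparams object and can be constructed standalone; being a residual block, it preserves the input shape. A small sketch:

import torch
from vits_decoder.bigv import AMPBlock

block = AMPBlock(channels=32, kernel_size=3, dilation=(1, 3, 5))
x = torch.randn(1, 32, 100)    # [B, C, T]
print(block(x).shape)          # torch.Size([1, 32, 100])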
vits_decoder/discriminator.py CHANGED
@@ -1,32 +1,39 @@
 import torch
 import torch.nn as nn

+from omegaconf import OmegaConf
+from .msd import ScaleDiscriminator
 from .mpd import MultiPeriodDiscriminator
 from .mrd import MultiResolutionDiscriminator
-from omegaconf import OmegaConf
+

 class Discriminator(nn.Module):
     def __init__(self, hp):
         super(Discriminator, self).__init__()
         self.MRD = MultiResolutionDiscriminator(hp)
         self.MPD = MultiPeriodDiscriminator(hp)
+        self.MSD = ScaleDiscriminator()

     def forward(self, x):
-        return self.MRD(x), self.MPD(x)
+        r = self.MRD(x)
+        p = self.MPD(x)
+        s = self.MSD(x)
+        return r + p + s
+

 if __name__ == '__main__':
-    hp = OmegaConf.load('../config/default.yaml')
+    hp = OmegaConf.load('../config/base.yaml')
     model = Discriminator(hp)

     x = torch.randn(3, 1, 16384)
     print(x.shape)

-    mrd_output, mpd_output = model(x)
-    for features, score in mpd_output:
+    output = model(x)
+    for features, score in output:
         for feat in features:
             print(feat.shape)
         print(score.shape)

-    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    pytorch_total_params = sum(p.numel()
+                               for p in model.parameters() if p.requires_grad)
     print(pytorch_total_params)
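Since MRD, MPD and the new MSD each return a Python list of (feature_maps, score) tuples, "r + p + s" is plain list concatenation, and downstream GAN losses can iterate over one flat list regardless of which sub-discriminator produced each entry. A usage sketch (the config path is an assumption, taken relative to the repo root):

import torch
from omegaconf import OmegaConf
from vits_decoder.discriminator import Discriminator

hp = OmegaConf.load('configs/base.yaml')   # assumed path from repo root
disc = Discriminator(hp)
for feature_maps, score in disc(torch.randn(3, 1, 16384)):
    print(len(feature_maps), tuple(score.shape))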
vits_decoder/generator.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np

 from torch.nn import Conv1d
@@ -8,8 +9,7 @@ from torch.nn.utils import weight_norm
 from torch.nn.utils import remove_weight_norm

 from .nsf import SourceModuleHnNSF
-from .bigv import init_weights, SnakeBeta, AMPBlock
-from .alias import Activation1d
+from .bigv import init_weights, AMPBlock, SnakeAlias


 class SpeakerAdapter(nn.Module):
@@ -57,24 +57,28 @@ class Generator(torch.nn.Module):
         # speaker adaper, 256 should change by what speaker encoder you use
         self.adapter = SpeakerAdapter(hp.vits.spk_dim, hp.gen.upsample_input)
         # pre conv
-        self.conv_pre = nn.utils.weight_norm(
-            Conv1d(hp.gen.upsample_input, hp.gen.upsample_initial_channel, 7, 1, padding=3))
+        self.conv_pre = Conv1d(hp.gen.upsample_input,
+                               hp.gen.upsample_initial_channel, 7, 1, padding=3)
         # nsf
         self.f0_upsamp = torch.nn.Upsample(
             scale_factor=np.prod(hp.gen.upsample_rates))
-        self.m_source = SourceModuleHnNSF()
+        self.m_source = SourceModuleHnNSF(sampling_rate=hp.data.sampling_rate)
         self.noise_convs = nn.ModuleList()
         # transposed conv-based upsamplers. does not apply anti-aliasing
         self.ups = nn.ModuleList()
         for i, (u, k) in enumerate(zip(hp.gen.upsample_rates, hp.gen.upsample_kernel_sizes)):
             # print(f'ups: {i} {k}, {u}, {(k - u) // 2}')
             # base
-            self.ups.append(nn.ModuleList([
-                weight_norm(ConvTranspose1d(hp.gen.upsample_initial_channel // (2 ** i),
-                                            hp.gen.upsample_initial_channel // (
-                                                2 ** (i + 1)),
-                                            k, u, padding=(k - u) // 2))
-            ]))
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        hp.gen.upsample_initial_channel // (2 ** i),
+                        hp.gen.upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2)
+                )
+            )
             # nsf
             if i + 1 < len(hp.gen.upsample_rates):
                 stride_f0 = np.prod(hp.gen.upsample_rates[i + 1:])
@@ -99,32 +103,30 @@ class Generator(torch.nn.Module):
         for i in range(len(self.ups)):
             ch = hp.gen.upsample_initial_channel // (2 ** (i + 1))
             for k, d in zip(hp.gen.resblock_kernel_sizes, hp.gen.resblock_dilation_sizes):
-                self.resblocks.append(AMPBlock(hp, ch, k, d))
+                self.resblocks.append(AMPBlock(ch, k, d))

         # post conv
-        activation_post = SnakeBeta(ch, alpha_logscale=True)
-        self.activation_post = Activation1d(activation=activation_post)
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-
+        self.activation_post = SnakeAlias(ch)
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
         # weight initialization
-        for i in range(len(self.ups)):
-            self.ups[i].apply(init_weights)
-        self.conv_post.apply(init_weights)
+        self.ups.apply(init_weights)

     def forward(self, spk, x, f0):
+        # Perturbation
+        x = x + torch.randn_like(x)
         # adapter
         x = self.adapter(x, spk)
+        x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))
         # nsf
         f0 = f0[:, None]
         f0 = self.f0_upsamp(f0).transpose(1, 2)
         har_source = self.m_source(f0)
         har_source = har_source.transpose(1, 2)
-        x = self.conv_pre(x)

         for i in range(self.num_upsamples):
             # upsampling
-            for i_up in range(len(self.ups[i])):
-                x = self.ups[i][i_up](x)
+            x = self.ups[i](x)
             # nsf
             x_source = self.noise_convs[i](har_source)
             x = x + x_source
@@ -145,12 +147,9 @@ class Generator(torch.nn.Module):

     def remove_weight_norm(self):
         for l in self.ups:
-            for l_i in l:
-                remove_weight_norm(l_i)
+            remove_weight_norm(l)
         for l in self.resblocks:
             l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)

     def eval(self, inference=False):
         super(Generator, self).eval()
@@ -177,11 +176,11 @@ class Generator(torch.nn.Module):
         # adapter
         x = self.adapter(x, spk)
         x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))

         for i in range(self.num_upsamples):
             # upsampling
-            for i_up in range(len(self.ups[i])):
-                x = self.ups[i][i_up](x)
+            x = self.ups[i](x)
             # nsf
             x_source = self.noise_convs[i](har_source)
             x = x + x_source
vits_decoder/med.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import torchaudio
+import typing as T
+
+
+class MelspecDiscriminator(torch.nn.Module):
+    """mel spectrogram (frequency domain) discriminator"""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.SAMPLE_RATE = 48000
+        # mel filterbank transform
+        self._melspec = torchaudio.transforms.MelSpectrogram(
+            sample_rate=self.SAMPLE_RATE,
+            n_fft=2048,
+            win_length=int(0.025 * self.SAMPLE_RATE),
+            hop_length=int(0.010 * self.SAMPLE_RATE),
+            n_mels=128,
+            power=1,
+        )
+
+        # time-frequency 2D convolutions
+        kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)]
+        strides = [(1, 2), (1, 2), (1, 2), (1, 2)]
+        self._convs = torch.nn.ModuleList(
+            [
+                torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=1 if i == 0 else 32,
+                        out_channels=64,
+                        kernel_size=k,
+                        stride=s,
+                        padding=(1, 2),
+                        bias=False,
+                    ),
+                    torch.nn.BatchNorm2d(num_features=64),
+                    torch.nn.GLU(dim=1),
+                )
+                for i, (k, s) in enumerate(zip(kernel_sizes, strides))
+            ]
+        )
+
+        # output adversarial projection
+        self._postnet = torch.nn.Conv2d(
+            in_channels=32,
+            out_channels=1,
+            kernel_size=(15, 3),
+            stride=(1, 2),
+        )
+
+    def forward(self, x: torch.Tensor) -> T.Tuple[torch.Tensor, T.List[torch.Tensor]]:
+        # apply the log-scale mel spectrogram transform
+        x = torch.log(self._melspec(x) + 1e-5)
+
+        # compute hidden layers and feature maps
+        f = []
+        for c in self._convs:
+            x = c(x)
+            f.append(x)
+
+        # apply the output projection and global average pooling
+        x = self._postnet(x)
+        x = x.mean(dim=[-2, -1])
+
+        return [(f, x)]
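One bookkeeping detail in MelspecDiscriminator: every Conv2d emits 64 channels, but GLU(dim=1) splits them into two halves and gates one with the other, leaving 32, which is why the later convs and the postnet take in_channels=32. Quick check:

import torch

glu = torch.nn.GLU(dim=1)
x = torch.randn(1, 64, 10, 10)
print(glu(x).shape)  # torch.Size([1, 32, 10, 10]) -- channel count halved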
vits_decoder/msd.py ADDED
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+
+
+class ScaleDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(ScaleDiscriminator, self).__init__()
+        self.convs = nn.ModuleList([
+            weight_norm(nn.Conv1d(1, 16, 15, 1, padding=7)),
+            weight_norm(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+            weight_norm(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+            weight_norm(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+            weight_norm(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+            weight_norm(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
+        ])
+        self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, 0.1)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return [(fmap, x)]
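The four stride-4 convolutions downsample by 256x overall, so ScaleDiscriminator emits roughly one logit per 256 input samples. Shape check (sketch):

import torch
from vits_decoder.msd import ScaleDiscriminator

(fmap, score), = ScaleDiscriminator()(torch.randn(3, 1, 16384))
print(score.shape)  # torch.Size([3, 64]) -- 16384 / 256 = 64 logits per item
print(len(fmap))    # 7 feature maps for a feature-matching loss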
vits_decoder/nsf.py CHANGED
@@ -356,34 +356,15 @@ class SourceModuleCycNoise_v1(torch.nn.Module):


 class SourceModuleHnNSF(torch.nn.Module):
-    """SourceModule for hn-nsf
-    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    harmonic_num: number of harmonic above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threhold to set U/V given F0 (default: 0)
-
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length 1)
-    uv (batchsize, length, 1)
-    """
-
     def __init__(
         self,
-        sampling_rate=48000,
-        harmonic_num=10,
+        sampling_rate=32000,
         sine_amp=0.1,
         add_noise_std=0.003,
         voiced_threshod=0,
     ):
         super(SourceModuleHnNSF, self).__init__()
-
+        harmonic_num = 10
         self.sine_amp = sine_amp
         self.noise_std = add_noise_std

@@ -393,17 +374,21 @@ class SourceModuleHnNSF(torch.nn.Module):
         )

         # to merge source harmonics into a single excitation
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
+        self.register_buffer('merge_w', torch.FloatTensor([[
+            0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
+            0.0221, -0.0083, -0.0241, -0.0036, -0.0581]]))
+        self.register_buffer('merge_b', torch.FloatTensor([0.0008]))

     def forward(self, x):
         """
-        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        Sine_source = SourceModuleHnNSF(F0_sampled)
         F0_sampled (batchsize, length, 1)
         Sine_source (batchsize, length, 1)
-        noise_source (batchsize, length 1)
         """
         # source for harmonic branch
         sine_wavs = self.l_sin_gen(x)
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        sine_wavs = torch_nn_func.linear(
+            sine_wavs, self.merge_w) + self.merge_b
+        sine_merge = self.l_tanh(sine_wavs)
         return sine_merge
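The trainable Linear(harmonic_num + 1, 1) that mixed the 11 harmonic sine sources is replaced by fixed weights stored as buffers (presumably frozen from a trained model, so the harmonic mix no longer drifts during fine-tuning and still loads from the checkpoint). Functionally it remains an 11 -> 1 affine map followed by tanh; a standalone sketch using the same constants:

import torch
import torch.nn.functional as F

merge_w = torch.FloatTensor([[0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
                              0.0221, -0.0083, -0.0241, -0.0036, -0.0581]])
merge_b = torch.FloatTensor([0.0008])

sine_wavs = torch.randn(2, 100, 11)              # (batch, length, harmonic_num + 1)
sine_merge = torch.tanh(F.linear(sine_wavs, merge_w) + merge_b)
print(sine_merge.shape)                          # torch.Size([2, 100, 1])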
vits_pretrain/{sovits5.0-48k-debug.pth → sovits5.0_bigvgan.pth} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2c3e49cebb2968266659507c80a007375f49d616ee216cf084cb3f87e93083d
-size 67866609
+oid sha256:ffed3845044b8bef076d72272da19791e1344ad3b750a02d6e4980acf6cb0a0b
+size 74825605