Commit 755994c
Parent(s): c24b656
mix

Changed files:
- app.py +19 -4
- configs/base.yaml +2 -1
- vits/data_utils.py +27 -10
- vits/models.py +16 -9
- vits_pretrain/{sovits5.0_bigvgan.pth → sovits5.0_bigvgan_mix.pth} +2 -2
- whisper/inference.py +2 -1
app.py
CHANGED
@@ -1,3 +1,5 @@
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from vits.models import SynthesizerInfer
 from omegaconf import OmegaConf
 import torchcrepe
@@ -72,7 +74,7 @@ model = SynthesizerInfer(
     hp.data.filter_length // 2 + 1,
     hp.data.segment_size // hp.data.hop_length,
     hp)
-load_svc_model("vits_pretrain/sovits5.0_bigvgan.pth", model)
+load_svc_model("vits_pretrain/sovits5.0_bigvgan_mix.pth", model)
 model.eval()
 model.to(device)

@@ -81,6 +83,8 @@ def svc_change(argswave, argsspk):

     argsppg = "svc_tmp.ppg.npy"
     os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
+    argsvec = "svc_tmp.vec.npy"
+    os.system(f"python hubert/inference.py -w {argswave} -v {argsvec}")

     spk = np.load(argsspk)
     spk = torch.FloatTensor(spk)
@@ -89,13 +93,20 @@ def svc_change(argswave, argsspk):
     ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
     ppg = torch.FloatTensor(ppg)

+    vec = np.load(argsvec)
+    vec = np.repeat(vec, 2, 0)  # 320 PPG -> 160 * 2
+    vec = torch.FloatTensor(vec)
+
     pit = compute_f0_nn(argswave, device)
     pit = torch.FloatTensor(pit)

     len_pit = pit.size()[0]
+    len_vec = vec.size()[0]
     len_ppg = ppg.size()[0]
-    len_min = min(len_pit, len_ppg)
+    len_min = min(len_pit, len_vec)
+    len_min = min(len_min, len_ppg)
     pit = pit[:len_min]
+    vec = vec[:len_min, :]
     ppg = ppg[:len_min, :]

     with torch.no_grad():
@@ -129,11 +140,12 @@ def svc_change(argswave, argsspk):
                 cut_e_out = -1 * hop_frame * hop_size

             sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
+            sub_vec = vec[cut_s:cut_e, :].unsqueeze(0).to(device)
             sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
             sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
             sub_har = source[:, :, cut_s *
                              hop_size:cut_e * hop_size].to(device)
-            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
+            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
             sub_out = sub_out[0, 0].data.cpu().detach().numpy()

             sub_out = sub_out[cut_s_out:cut_e_out]
@@ -148,10 +160,11 @@ def svc_change(argswave, argsspk):
                 cut_s = 0
                 cut_s_out = 0
             sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
+            sub_vec = vec[cut_s:, :].unsqueeze(0).to(device)
             sub_pit = pit[cut_s:].unsqueeze(0).to(device)
             sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
             sub_har = source[:, :, cut_s * hop_size:].to(device)
-            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
+            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
             sub_out = sub_out[0, 0].data.cpu().detach().numpy()

             sub_out = sub_out[cut_s_out:]
@@ -187,6 +200,8 @@ with app:

         https://github.com/Multi-Singer/Multi-Singer.github.io

+        mix_encoder: whisper + hubert, 提升跨语言能力和纯对白语音训练的效果
+
         [轻度伴奏可以无需去伴奏]就能直接进行歌声转换的SVC库
         """)
         sid = gr.Dropdown(label="音色", choices=[
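For orientation, here is a minimal sketch (not part of the commit) of the feature-alignment step that svc_change now performs before chunked inference: the outputs of both content encoders are upsampled 2x in time and trimmed, together with the F0 track, to a common frame count. The helper name is illustrative only.

import numpy as np
import torch

def align_features(ppg: np.ndarray, vec: np.ndarray, pit: np.ndarray):
    # ppg: whisper output, vec: hubert output, pit: per-frame F0 (all [T, ...]).
    ppg = torch.FloatTensor(np.repeat(ppg, 2, 0))  # 320 -> 160 * 2, as in app.py
    vec = torch.FloatTensor(np.repeat(vec, 2, 0))
    pit = torch.FloatTensor(pit)
    n = min(pit.size(0), vec.size(0), ppg.size(0))  # keep all three streams frame-aligned
    return ppg[:n, :], vec[:n, :], pit[:n]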
configs/base.yaml
CHANGED
@@ -28,11 +28,12 @@ data:
 #############################
 vits:
   ppg_dim: 1024
+  vec_dim: 256
   spk_dim: 256
   gin_channels: 256
   inter_channels: 192
   hidden_channels: 192
-  filter_channels:
+  filter_channels: 640
 #############################
 gen:
   upsample_input: 192
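The new key is read through OmegaConf like the existing ones; a quick sanity check, assuming the repository layout above:

from omegaconf import OmegaConf

hp = OmegaConf.load("configs/base.yaml")
print(hp.vits.ppg_dim)          # 1024 - whisper PPG feature size
print(hp.vits.vec_dim)          # 256  - hubert feature size added by this commit
print(hp.vits.filter_channels)  # 640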
vits/data_utils.py
CHANGED
@@ -29,13 +29,15 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         items_new = []
         items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
         items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
-        for wavpath, spec, pitch, ppg, spk in self.items:
+        for wavpath, spec, pitch, vec, ppg, spk in self.items:
             if not os.path.isfile(wavpath):
                 continue
             if not os.path.isfile(spec):
                 continue
             if not os.path.isfile(pitch):
                 continue
+            if not os.path.isfile(vec):
+                continue
             if not os.path.isfile(ppg):
                 continue
             if not os.path.isfile(spk):
@@ -46,7 +48,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
                 continue
             if (usel >= items_max):
                 usel = items_max
-            items_new.append([wavpath, spec, pitch, ppg, spk, usel])
+            items_new.append([wavpath, spec, pitch, vec, ppg, spk, usel])
             lengths.append(usel)
         self.items = items_new
         self.lengths = lengths
@@ -70,28 +72,35 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         wav = item[0]
         spe = item[1]
         pit = item[2]
-        ppg = item[3]
-        spk = item[4]
-        use = item[5]
+        vec = item[3]
+        ppg = item[4]
+        spk = item[5]
+        use = item[6]

         wav = self.read_wav(wav)
         spe = torch.load(spe)

         pit = np.load(pit)
+        vec = np.load(vec)
+        vec = np.repeat(vec, 2, 0)  # 320 PPG -> 160 * 2
         ppg = np.load(ppg)
         ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
         spk = np.load(spk)

         pit = torch.FloatTensor(pit)
+        vec = torch.FloatTensor(vec)
         ppg = torch.FloatTensor(ppg)
         spk = torch.FloatTensor(spk)

         len_pit = pit.size()[0]
+        len_vec = vec.size()[0] - 2  # for safe
         len_ppg = ppg.size()[0] - 2  # for safe
-        len_min = min(len_pit, len_ppg)
+        len_min = min(len_pit, len_vec)
+        len_min = min(len_min, len_ppg)
         len_wav = len_min * self.hop_length

         pit = pit[:len_min]
+        vec = vec[:len_min, :]
         ppg = ppg[:len_min, :]
         spe = spe[:, :len_min]
         wav = wav[:, :len_wav]
@@ -101,6 +110,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         frame_end = frame_start + use

         pit = pit[frame_start:frame_end]
+        vec = vec[frame_start:frame_end, :]
         ppg = ppg[frame_start:frame_end, :]
         spe = spe[:, frame_start:frame_end]

@@ -112,7 +122,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         # print(ppg.shape)
         # print(pit.shape)
         # print(spk.shape)
-        return spe, wav, ppg, pit, spk
+        return spe, wav, ppg, vec, pit, spk


 class TextAudioSpeakerCollate:
@@ -143,10 +153,13 @@ class TextAudioSpeakerCollate:
         ppg_lengths = torch.FloatTensor(len(batch))
         ppg_padded = torch.FloatTensor(
             len(batch), max_ppg_len, batch[0][2].size(1))
+        vec_padded = torch.FloatTensor(
+            len(batch), max_ppg_len, batch[0][3].size(1))
         pit_padded = torch.FloatTensor(len(batch), max_ppg_len)
         ppg_padded.zero_()
+        vec_padded.zero_()
         pit_padded.zero_()
-        spk = torch.FloatTensor(len(batch), batch[0][4].size(0))
+        spk = torch.FloatTensor(len(batch), batch[0][5].size(0))

         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
@@ -163,10 +176,13 @@ class TextAudioSpeakerCollate:
             ppg_padded[i, : ppg.size(0), :] = ppg
             ppg_lengths[i] = ppg.size(0)

-            pit = row[3]
+            vec = row[3]
+            vec_padded[i, : vec.size(0), :] = vec
+
+            pit = row[4]
             pit_padded[i, : pit.size(0)] = pit

-            spk[i] = row[4]
+            spk[i] = row[5]
         # print(ppg_padded.shape)
         # print(ppg_lengths.shape)
         # print(pit_padded.shape)
@@ -178,6 +194,7 @@ class TextAudioSpeakerCollate:
         return (
             ppg_padded,
             ppg_lengths,
+            vec_padded,
             pit_padded,
             spk,
             spe_padded,
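Since the dataset now unpacks six paths per entry, training filelists need one extra column for the hubert feature. A hedged sketch of the per-item layout follows; the paths and naming below are hypothetical, and the preprocessing scripts that produce them are outside this diff:

# One filelist entry must now resolve to six files, in this order:
#   wavpath, spec, pitch, vec, ppg, spk
# __getitem__ then returns (spe, wav, ppg, vec, pit, spk), which the collate
# function pads into (ppg_padded, ppg_lengths, vec_padded, pit_padded, spk, spe_padded, ...).
example_item = [
    "data_svc/waves/0001.wav",       # hypothetical paths, for illustration only
    "data_svc/specs/0001.pt",
    "data_svc/pitch/0001.pit.npy",
    "data_svc/hubert/0001.vec.npy",  # new column introduced by this commit
    "data_svc/whisper/0001.ppg.npy",
    "data_svc/speaker/0001.spk.npy",
]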
vits/models.py
CHANGED
@@ -14,6 +14,7 @@ from vits.modules_grl import SpeakerClassifier
 class TextEncoder(nn.Module):
     def __init__(self,
                  in_channels,
+                 vec_channels,
                  out_channels,
                  hidden_channels,
                  filter_channels,
@@ -24,6 +25,7 @@ class TextEncoder(nn.Module):
         super().__init__()
         self.out_channels = out_channels
         self.pre = nn.Conv1d(in_channels, hidden_channels, kernel_size=5, padding=2)
+        self.hub = nn.Conv1d(vec_channels, hidden_channels, kernel_size=5, padding=2)
         self.pit = nn.Embedding(256, hidden_channels)
         self.enc = attentions.Encoder(
             hidden_channels,
@@ -34,13 +36,15 @@ class TextEncoder(nn.Module):
             p_dropout)
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

-    def forward(self, x, x_lengths, f0):
+    def forward(self, x, x_lengths, v, f0):
         x = torch.transpose(x, 1, -1)  # [b, h, t]
         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
             x.dtype
         )
         x = self.pre(x) * x_mask
-        x = x + self.pit(f0).transpose(1, 2)
+        v = torch.transpose(v, 1, -1)  # [b, h, t]
+        v = self.hub(v) * x_mask
+        x = x + v + self.pit(f0).transpose(1, 2)
         x = self.enc(x * x_mask, x_mask)
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
@@ -144,6 +148,7 @@ class SynthesizerTrn(nn.Module):
         self.emb_g = nn.Linear(hp.vits.spk_dim, hp.vits.gin_channels)
         self.enc_p = TextEncoder(
             hp.vits.ppg_dim,
+            hp.vits.vec_dim,
             hp.vits.inter_channels,
             hp.vits.hidden_channels,
             hp.vits.filter_channels,
@@ -175,11 +180,12 @@ class SynthesizerTrn(nn.Module):
         )
         self.dec = Generator(hp=hp)

-    def forward(self, ppg, pit, spec, spk, ppg_l, spec_l):
-        ppg = ppg + torch.randn_like(ppg)  # Perturbation
+    def forward(self, ppg, vec, pit, spec, spk, ppg_l, spec_l):
+        ppg = ppg + torch.randn_like(ppg) * 1  # Perturbation
+        vec = vec + torch.randn_like(vec) * 2  # Perturbation
         g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)

         z_slice, pit_slice, ids_slice = commons.rand_slice_segments_with_pitch(
@@ -193,10 +199,10 @@ class SynthesizerTrn(nn.Module):
         spk_preds = self.speaker_classifier(x)
         return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds

-    def infer(self, ppg, pit, spk, ppg_l):
+    def infer(self, ppg, vec, pit, spk, ppg_l):
         ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec(spk, z * ppg_mask, f0=pit)
         return o
@@ -213,6 +219,7 @@ class SynthesizerInfer(nn.Module):
         self.segment_size = segment_size
         self.enc_p = TextEncoder(
             hp.vits.ppg_dim,
+            hp.vits.vec_dim,
             hp.vits.inter_channels,
             hp.vits.hidden_channels,
             hp.vits.filter_channels,
@@ -241,9 +248,9 @@ class SynthesizerInfer(nn.Module):
     def source2wav(self, source):
         return self.dec.source2wav(source)

-    def inference(self, ppg, pit, spk, ppg_l, source):
+    def inference(self, ppg, vec, pit, spk, ppg_l, source):
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec.inference(spk, z * ppg_mask, source)
         return o
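A self-contained sketch of the fusion the updated TextEncoder.forward performs (masking and the attention encoder omitted): the whisper PPG branch and the new hubert vec branch are each projected to hidden_channels and summed with the coarse-F0 embedding. The dimensions follow configs/base.yaml above.

import torch
import torch.nn as nn

hidden = 192                                              # hp.vits.hidden_channels
pre = nn.Conv1d(1024, hidden, kernel_size=5, padding=2)   # ppg branch (ppg_dim)
hub = nn.Conv1d(256, hidden, kernel_size=5, padding=2)    # vec branch (vec_dim, new)
pit = nn.Embedding(256, hidden)                           # coarse-F0 bins

ppg = torch.randn(1, 100, 1024)        # [b, t, ppg_dim]
vec = torch.randn(1, 100, 256)         # [b, t, vec_dim]
f0 = torch.randint(0, 256, (1, 100))   # coarse pitch ids, [b, t]

x = pre(ppg.transpose(1, 2)) + hub(vec.transpose(1, 2)) + pit(f0).transpose(1, 2)
print(x.shape)  # torch.Size([1, 192, 100])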
vits_pretrain/{sovits5.0_bigvgan.pth → sovits5.0_bigvgan_mix.pth}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6b941958b20d2eb91abdb6ff9d1344e056ec2c78116e4c3a1e2b23b022d32db1
+size 79352005
whisper/inference.py
CHANGED
@@ -1,4 +1,5 @@
-import os
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import argparse
 import torch