OlaWod committed on
Commit 18be3e0
1 Parent(s): 4909891

first commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -1
  2. .gitignore +3 -0
  3. LICENSE +21 -0
  4. README.md +1 -0
  5. Utils/JDC/__init__.py +1 -0
  6. Utils/JDC/bst.t7.txt +1 -0
  7. Utils/JDC/model.py +190 -0
  8. Utils/__init__.py +1 -0
  9. app.py +142 -0
  10. config_v1_16k.json +42 -0
  11. dataset/audio/p225/p225_220.wav +0 -0
  12. dataset/audio/p226/p226_341.wav +0 -0
  13. dataset/audio/p227/p227_021.wav +0 -0
  14. dataset/audio/p228/p228_242.wav +0 -0
  15. dataset/audio/p229/p229_021.wav +0 -0
  16. dataset/audio/p230/p230_361.wav +0 -0
  17. dataset/audio/p231/p231_197.wav +0 -0
  18. dataset/audio/p232/p232_023.wav +0 -0
  19. dataset/audio/p233/p233_323.wav +0 -0
  20. dataset/audio/p234/p234_229.wav +0 -0
  21. dataset/audio/p236/p236_068.wav +0 -0
  22. dataset/audio/p237/p237_023.wav +0 -0
  23. dataset/audio/p238/p238_023.wav +0 -0
  24. dataset/audio/p239/p239_023.wav +0 -0
  25. dataset/audio/p240/p240_004.wav +0 -0
  26. dataset/audio/p241/p241_050.wav +0 -0
  27. dataset/audio/p243/p243_087.wav +0 -0
  28. dataset/audio/p244/p244_008.wav +0 -0
  29. dataset/audio/p245/p245_014.wav +0 -0
  30. dataset/audio/p246/p246_022.wav +0 -0
  31. dataset/audio/p247/p247_380.wav +0 -0
  32. dataset/audio/p248/p248_023.wav +0 -0
  33. dataset/audio/p249/p249_223.wav +0 -0
  34. dataset/audio/p250/p250_021.wav +0 -0
  35. dataset/audio/p251/p251_364.wav +0 -0
  36. dataset/audio/p252/p252_023.wav +0 -0
  37. dataset/audio/p253/p253_207.wav +0 -0
  38. dataset/audio/p254/p254_023.wav +0 -0
  39. dataset/audio/p255/p255_038.wav +0 -0
  40. dataset/audio/p256/p256_079.wav +0 -0
  41. dataset/audio/p257/p257_023.wav +0 -0
  42. dataset/audio/p258/p258_228.wav +0 -0
  43. dataset/audio/p259/p259_011.wav +0 -0
  44. dataset/audio/p260/p260_103.wav +0 -0
  45. dataset/audio/p261/p261_023.wav +0 -0
  46. dataset/audio/p262/p262_210.wav +0 -0
  47. dataset/audio/p263/p263_218.wav +0 -0
  48. dataset/audio/p264/p264_438.wav +0 -0
  49. dataset/audio/p265/p265_273.wav +0 -0
  50. dataset/audio/p266/p266_417.wav +0 -0
.gitattributes CHANGED
@@ -11,7 +11,7 @@
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
+ # *.npy filter=lfs diff=lfs merge=lfs -text
  *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ exp/default/g_00700000 filter=lfs diff=lfs merge=lfs -text
+ Utils/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ flagged
+ out.wav
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Jingyi Li
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -7,6 +7,7 @@ sdk: gradio
  sdk_version: 4.22.0
  app_file: app.py
  pinned: false
+ license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Utils/JDC/__init__.py ADDED
@@ -0,0 +1 @@
+
Utils/JDC/bst.t7.txt ADDED
@@ -0,0 +1 @@
+ https://github.com/yl4579/HiFTNet/blob/main/Utils/JDC/bst.t7
Utils/JDC/model.py ADDED
@@ -0,0 +1,190 @@
+ """
+ Implementation of model from:
+ Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
+ Convolutional Recurrent Neural Networks" (2019)
+ Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
+ """
+ import torch
+ from torch import nn
+
+ class JDCNet(nn.Module):
+     """
+     Joint Detection and Classification Network model for singing voice melody.
+     """
+     def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
+         super().__init__()
+         self.num_class = num_class
+
+         # input = (b, 1, 31, 513), b = batch size
+         self.conv_block = nn.Sequential(
+             nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False), # out: (b, 64, 31, 513)
+             nn.BatchNorm2d(num_features=64),
+             nn.LeakyReLU(leaky_relu_slope, inplace=True),
+             nn.Conv2d(64, 64, 3, padding=1, bias=False), # (b, 64, 31, 513)
+         )
+
+         # res blocks
+         self.res_block1 = ResBlock(in_channels=64, out_channels=128) # (b, 128, 31, 128)
+         self.res_block2 = ResBlock(in_channels=128, out_channels=192) # (b, 192, 31, 32)
+         self.res_block3 = ResBlock(in_channels=192, out_channels=256) # (b, 256, 31, 8)
+
+         # pool block
+         self.pool_block = nn.Sequential(
+             nn.BatchNorm2d(num_features=256),
+             nn.LeakyReLU(leaky_relu_slope, inplace=True),
+             nn.MaxPool2d(kernel_size=(1, 4)), # (b, 256, 31, 2)
+             nn.Dropout(p=0.2),
+         )
+
+         # maxpool layers (for auxiliary network inputs)
+         # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2)
+         self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
+         # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
+         self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
+         # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2)
+         self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))
+
+         # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
+         self.detector_conv = nn.Sequential(
+             nn.Conv2d(640, 256, 1, bias=False),
+             nn.BatchNorm2d(256),
+             nn.LeakyReLU(leaky_relu_slope, inplace=True),
+             nn.Dropout(p=0.2),
+         )
+
+         # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+         self.bilstm_classifier = nn.LSTM(
+             input_size=512, hidden_size=256,
+             batch_first=True, bidirectional=True) # (b, 31, 512)
+
+         # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+         self.bilstm_detector = nn.LSTM(
+             input_size=512, hidden_size=256,
+             batch_first=True, bidirectional=True) # (b, 31, 512)
+
+         # input: (b * 31, 512)
+         self.classifier = nn.Linear(in_features=512, out_features=self.num_class) # (b * 31, num_class)
+
+         # input: (b * 31, 512)
+         self.detector = nn.Linear(in_features=512, out_features=2) # (b * 31, 2) - binary classifier
+
+         # initialize weights
+         self.apply(self.init_weights)
+
+     def get_feature_GAN(self, x):
+         seq_len = x.shape[-2]
+         x = x.float().transpose(-1, -2)
+
+         convblock_out = self.conv_block(x)
+
+         resblock1_out = self.res_block1(convblock_out)
+         resblock2_out = self.res_block2(resblock1_out)
+         resblock3_out = self.res_block3(resblock2_out)
+         poolblock_out = self.pool_block[0](resblock3_out)
+         poolblock_out = self.pool_block[1](poolblock_out)
+
+         return poolblock_out.transpose(-1, -2)
+
+     def get_feature(self, x):
+         seq_len = x.shape[-2]
+         x = x.float().transpose(-1, -2)
+
+         convblock_out = self.conv_block(x)
+
+         resblock1_out = self.res_block1(convblock_out)
+         resblock2_out = self.res_block2(resblock1_out)
+         resblock3_out = self.res_block3(resblock2_out)
+         poolblock_out = self.pool_block[0](resblock3_out)
+         poolblock_out = self.pool_block[1](poolblock_out)
+
+         return self.pool_block[2](poolblock_out)
+
+     def forward(self, x):
+         """
+         Returns:
+             classification_prediction, detection_prediction
+             sizes: (b, 31, 722), (b, 31, 2)
+         """
+         ###############################
+         # forward pass for classifier #
+         ###############################
+         seq_len = x.shape[-1]
+         x = x.float().transpose(-1, -2)
+
+         convblock_out = self.conv_block(x)
+
+         resblock1_out = self.res_block1(convblock_out)
+         resblock2_out = self.res_block2(resblock1_out)
+         resblock3_out = self.res_block3(resblock2_out)
+
+
+         poolblock_out = self.pool_block[0](resblock3_out)
+         poolblock_out = self.pool_block[1](poolblock_out)
+         GAN_feature = poolblock_out.transpose(-1, -2)
+         poolblock_out = self.pool_block[2](poolblock_out)
+
+         # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
+         classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
+         classifier_out, _ = self.bilstm_classifier(classifier_out) # ignore the hidden states
+
+         classifier_out = classifier_out.contiguous().view((-1, 512)) # (b * 31, 512)
+         classifier_out = self.classifier(classifier_out)
+         classifier_out = classifier_out.view((-1, seq_len, self.num_class)) # (b, 31, num_class)
+
+         # sizes: (b, 31, 722), (b, 31, 2)
+         # classifier output consists of predicted pitch classes per frame
+         # detector output consists of: (isvoice, notvoice) estimates per frame
+         return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
+
+     @staticmethod
+     def init_weights(m):
+         if isinstance(m, nn.Linear):
+             nn.init.kaiming_uniform_(m.weight)
+             if m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.Conv2d):
+             nn.init.xavier_normal_(m.weight)
+         elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
+             for p in m.parameters():
+                 if p.data is None:
+                     continue
+
+                 if len(p.shape) >= 2:
+                     nn.init.orthogonal_(p.data)
+                 else:
+                     nn.init.normal_(p.data)
+
+
+ class ResBlock(nn.Module):
+     def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
+         super().__init__()
+         self.downsample = in_channels != out_channels
+
+         # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
+         self.pre_conv = nn.Sequential(
+             nn.BatchNorm2d(num_features=in_channels),
+             nn.LeakyReLU(leaky_relu_slope, inplace=True),
+             nn.MaxPool2d(kernel_size=(1, 2)), # apply downsampling on the y axis only
+         )
+
+         # conv layers
+         self.conv = nn.Sequential(
+             nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                       kernel_size=3, padding=1, bias=False),
+             nn.BatchNorm2d(out_channels),
+             nn.LeakyReLU(leaky_relu_slope, inplace=True),
+             nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
+         )
+
+         # 1 x 1 convolution layer to match the feature dimensions
+         self.conv1by1 = None
+         if self.downsample:
+             self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)
+
+     def forward(self, x):
+         x = self.pre_conv(x)
+         if self.downsample:
+             x = self.conv(x) + self.conv1by1(x)
+         else:
+             x = self.conv(x) + x
+         return x
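A quick smoke test of the model above (an illustrative sketch, not a file in this commit): app.py below instantiates the network as `JDCNet(num_class=1, seq_len=192)` and feeds 80-bin mel spectrograms, so a dummy input of shape `(batch, 1, n_mels, frames)` exercises the same path. The shapes follow from the layer definitions above.

```python
import torch

from Utils.JDC.model import JDCNet

# F0-predictor configuration used by app.py (not the 722-class melody setup)
model = JDCNet(num_class=1, seq_len=192).eval()
mel = torch.randn(1, 1, 80, 192)  # (batch, 1, n_mels, frames)

with torch.no_grad():
    f0, gan_feature, pool_out = model(mel)

print(f0.shape)           # torch.Size([192]) - one F0 estimate per frame
print(gan_feature.shape)  # torch.Size([1, 256, 10, 192])
print(pool_out.shape)     # torch.Size([1, 256, 192, 2])
```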
Utils/__init__.py ADDED
@@ -0,0 +1 @@
+
app.py ADDED
@@ -0,0 +1,142 @@
+ import os
+ import json
+ import math
+
+ import torch
+ import torch.nn.functional as F
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import gradio as gr
+ from transformers import WavLMModel
+
+ from env import AttrDict
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE
+ from models import Generator
+ from stft import TorchSTFT
+ from Utils.JDC.model import JDCNet
+
+
+ # files
+ hpfile = "config_v1_16k.json"
+ ptfile = "exp/default/g_00700000"
+ spk2id_path = "filelists/spk2id.json"
+ f0_stats_path = "filelists/f0_stats.json"
+ spk_stats_path = "filelists/spk_stats.json"
+ spk_emb_dir = "dataset/spk"
+ spk_wav_dir = "dataset/audio"
+
+ # device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # load config
+ with open(hpfile) as f:
+     data = f.read()
+ json_config = json.loads(data)
+ h = AttrDict(json_config)
+
+ # load models
+ F0_model = JDCNet(num_class=1, seq_len=192)
+ generator = Generator(h, F0_model).to(device)
+ stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size, win_length=h.gen_istft_n_fft).to(device)
+
+ state_dict_g = torch.load(ptfile, map_location=device)
+ generator.load_state_dict(state_dict_g['generator'], strict=True)
+ generator.remove_weight_norm()
+ _ = generator.eval()
+
+ wavlm = WavLMModel.from_pretrained("microsoft/wavlm-base-plus")
+ wavlm.eval()
+ wavlm.to(device)
+
+ # load stats
+ with open(spk2id_path) as f:
+     spk2id = json.load(f)
+ with open(f0_stats_path) as f:
+     f0_stats = json.load(f)
+ with open(spk_stats_path) as f:
+     spk_stats = json.load(f)
+
+ # tune f0
+ threshold = 10
+ step = (math.log(1100) - math.log(50)) / 256
+ def tune_f0(initial_f0, i):
+     if i == 0:
+         return initial_f0
+     voiced = initial_f0 > threshold
+     initial_lf0 = torch.log(initial_f0)
+     lf0 = initial_lf0 + step * i
+     f0 = torch.exp(lf0)
+     f0 = torch.where(voiced, f0, initial_f0)
+     return f0
+
+ # convert function
+ def convert(tgt_spk, src_wav, f0_shift=0):
+     tgt_ref = spk_stats[tgt_spk]["best_spk_emb"]
+     tgt_emb = f"{spk_emb_dir}/{tgt_spk}/{tgt_ref}.npy"
+
+     with torch.no_grad():
+         # tgt
+         spk_id = spk2id[tgt_spk]
+         spk_id = torch.LongTensor([spk_id]).unsqueeze(0).to(device)
+
+         spk_emb = np.load(tgt_emb)
+         spk_emb = torch.from_numpy(spk_emb).unsqueeze(0).to(device)
+
+         f0_mean_tgt = f0_stats[tgt_spk]["mean"]
+
+         # src
+         wav, sr = librosa.load(src_wav, sr=16000)
+         wav = torch.FloatTensor(wav).to(device)
+         mel = mel_spectrogram(wav.unsqueeze(0), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
+
+         x = wavlm(wav.unsqueeze(0)).last_hidden_state
+         x = x.transpose(1, 2) # (B, C, T)
+         x = F.pad(x, (0, mel.size(2) - x.size(2)), 'constant')
+
+         # cvt
+         f0 = generator.get_f0(mel, f0_mean_tgt)
+         f0 = tune_f0(f0, f0_shift)
+         x = generator.get_x(x, spk_emb, spk_id)
+         y = generator.infer(x, f0, stft)
+
+         audio = y.squeeze()
+         audio = audio / torch.max(torch.abs(audio)) * 0.95
+         audio = audio * MAX_WAV_VALUE
+         audio = audio.cpu().numpy().astype('int16')
+
+         sf.write("out.wav", audio, h.sampling_rate, "PCM_16")
+
+     out_wav = "out.wav"
+     return out_wav
+
+ # change spk
+ def change_spk(tgt_spk):
+     tgt_ref = spk_stats[tgt_spk]["best_spk_emb"]
+     tgt_wav = f"{spk_wav_dir}/{tgt_spk}/{tgt_ref}.wav"
+     return tgt_wav
+
+ # interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# PitchVC")
+     gr.Markdown("Gradio Demo for PitchVC. ([Github Repo](https://github.com/OlaWod/PitchVC))")
+
+     with gr.Row():
+         with gr.Column():
+             tgt_spk = gr.Dropdown(choices=spk2id.keys(), type="value", label="Target Speaker")
+             ref_audio = gr.Audio(label="Reference Audio", type='filepath')
+             src_audio = gr.Audio(label="Source Audio", type='filepath')
+             f0_shift = gr.Slider(minimum=-30, maximum=30, value=0, step=1, label="F0 Shift")
+         with gr.Column():
+             out_audio = gr.Audio(label="Output Audio", type='filepath')
+             submit = gr.Button(value="Submit")
+
+     tgt_spk.change(fn=change_spk, inputs=[tgt_spk], outputs=[ref_audio])
+     submit.click(convert, [tgt_spk, src_audio, f0_shift], [out_audio])
+
+     examples = gr.Examples(
+         examples=[["p225", 'dataset/audio/p226/p226_341.wav', 0],
+                   ["p226", 'dataset/audio/p225/p225_220.wav', -5]],
+         inputs=[tgt_spk, src_audio, f0_shift])
+
+ demo.launch()
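For reference (an illustrative check, not part of the commit): `tune_f0` applies the "F0 Shift" slider in the log domain, so every slider unit scales voiced pitch by a constant ratio; the step divides the 50-1100 Hz range into 256 equal log-steps. A quick numeric check:

```python
import math

step = (math.log(1100) - math.log(50)) / 256  # ~0.01207 per slider unit

print(math.exp(step))       # ~1.0121 -> each unit raises voiced F0 by ~1.2%
print(math.exp(step * -5))  # ~0.9414 -> the "-5" example lowers pitch by ~6%
print(math.exp(step * 30))  # ~1.4365 -> slider maximum raises pitch by ~44%
```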
config_v1_16k.json ADDED
@@ -0,0 +1,42 @@
+ {
+     "F0_path": "Utils/JDC/bst.t7",
+
+     "use_aug": true,
+
+     "resblock": "1",
+     "num_gpus": 1,
+     "batch_size": 16,
+     "learning_rate": 0.0002,
+     "adam_b1": 0.8,
+     "adam_b2": 0.99,
+     "lr_decay": 0.999,
+     "seed": 1234,
+
+     "upsample_rates": [10,8],
+     "upsample_kernel_sizes": [20,16],
+     "upsample_initial_channel": 512,
+     "resblock_kernel_sizes": [3,7,11],
+     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+     "gen_istft_n_fft": 16,
+     "gen_istft_hop_size": 4,
+
+     "segment_size": 16000,
+     "num_mels": 80,
+     "n_fft": 1024,
+     "hop_size": 320,
+     "win_size": 1024,
+
+     "sampling_rate": 16000,
+
+     "fmin": 0,
+     "fmax": 8000,
+     "fmax_for_loss": null,
+
+     "num_workers": 8,
+
+     "dist_config": {
+         "dist_backend": "nccl",
+         "dist_url": "tcp://localhost:54321",
+         "world_size": 1
+     }
+ }
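A consistency sketch for this config (an assumption about how these fields relate, based on the iSTFT-style HiFi-GAN generator they describe; not a check shipped in the commit): the generator's total upsampling factor times the iSTFT hop should equal the mel `hop_size`, so each 20 ms mel frame maps back to 320 samples at 16 kHz.

```python
import json
import math

with open("config_v1_16k.json") as f:
    h = json.load(f)

upsample = math.prod(h["upsample_rates"])       # 10 * 8 = 80
total_hop = upsample * h["gen_istft_hop_size"]  # 80 * 4 = 320
assert total_hop == h["hop_size"]               # one mel frame <-> 320 samples

print(h["sampling_rate"] / h["hop_size"])       # 50.0 mel frames per second
```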
dataset/audio/p225/p225_220.wav ADDED
Binary file (101 kB).
dataset/audio/p226/p226_341.wav ADDED
Binary file (93.3 kB).
dataset/audio/p227/p227_021.wav ADDED
Binary file (294 kB).
dataset/audio/p228/p228_242.wav ADDED
Binary file (87.1 kB).
dataset/audio/p229/p229_021.wav ADDED
Binary file (239 kB).
dataset/audio/p230/p230_361.wav ADDED
Binary file (132 kB).
dataset/audio/p231/p231_197.wav ADDED
Binary file (36.9 kB).
dataset/audio/p232/p232_023.wav ADDED
Binary file (285 kB).
dataset/audio/p233/p233_323.wav ADDED
Binary file (133 kB).
dataset/audio/p234/p234_229.wav ADDED
Binary file (73.8 kB).
dataset/audio/p236/p236_068.wav ADDED
Binary file (89.2 kB).
dataset/audio/p237/p237_023.wav ADDED
Binary file (272 kB).
dataset/audio/p238/p238_023.wav ADDED
Binary file (372 kB).
dataset/audio/p239/p239_023.wav ADDED
Binary file (265 kB).
dataset/audio/p240/p240_004.wav ADDED
Binary file (119 kB).
dataset/audio/p241/p241_050.wav ADDED
Binary file (64.6 kB).
dataset/audio/p243/p243_087.wav ADDED
Binary file (109 kB).
dataset/audio/p244/p244_008.wav ADDED
Binary file (225 kB).
dataset/audio/p245/p245_014.wav ADDED
Binary file (154 kB).
dataset/audio/p246/p246_022.wav ADDED
Binary file (196 kB).
dataset/audio/p247/p247_380.wav ADDED
Binary file (92.2 kB).
dataset/audio/p248/p248_023.wav ADDED
Binary file (396 kB).
dataset/audio/p249/p249_223.wav ADDED
Binary file (116 kB).
dataset/audio/p250/p250_021.wav ADDED
Binary file (225 kB).
dataset/audio/p251/p251_364.wav ADDED
Binary file (128 kB).
dataset/audio/p252/p252_023.wav ADDED
Binary file (324 kB).
dataset/audio/p253/p253_207.wav ADDED
Binary file (101 kB).
dataset/audio/p254/p254_023.wav ADDED
Binary file (286 kB).
dataset/audio/p255/p255_038.wav ADDED
Binary file (114 kB).
dataset/audio/p256/p256_079.wav ADDED
Binary file (119 kB).
dataset/audio/p257/p257_023.wav ADDED
Binary file (242 kB).
dataset/audio/p258/p258_228.wav ADDED
Binary file (89.2 kB).
dataset/audio/p259/p259_011.wav ADDED
Binary file (191 kB).
dataset/audio/p260/p260_103.wav ADDED
Binary file (121 kB).
dataset/audio/p261/p261_023.wav ADDED
Binary file (286 kB).
dataset/audio/p262/p262_210.wav ADDED
Binary file (118 kB).
dataset/audio/p263/p263_218.wav ADDED
Binary file (101 kB).
dataset/audio/p264/p264_438.wav ADDED
Binary file (125 kB).
dataset/audio/p265/p265_273.wav ADDED
Binary file (119 kB).
dataset/audio/p266/p266_417.wav ADDED
Binary file (89.2 kB).