darksakura committed
Commit 5ae69c3
1 Parent(s): 01dcf5d

Upload 91 files

data_utils.py CHANGED
@@ -48,8 +48,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         filename = filename.replace("\\", "/")
         audio, sampling_rate = load_wav_to_torch(filename)
         if sampling_rate != self.sampling_rate:
-            raise ValueError("{} SR doesn't match target {} SR".format(
-                sampling_rate, self.sampling_rate))
+            raise ValueError(
+                "Sample Rate not match. Expect {} but got {} from {}".format(
+                    self.sampling_rate, sampling_rate, filename))
         audio_norm = audio / self.max_wav_value
         audio_norm = audio_norm.unsqueeze(0)
         spec_filename = filename.replace(".wav", ".spec.pt")
inference/infer_tool.py CHANGED
@@ -151,6 +151,7 @@ class Svc(object):
            self.target_sample = self.diffusion_args.data.sampling_rate
            self.hop_size = self.diffusion_args.data.block_size
            self.spk2id = self.diffusion_args.spk
+           self.dtype = torch.float32
            self.speech_encoder = self.diffusion_args.data.encoder
            self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
            if spk_mix_enable:
@@ -202,9 +203,10 @@ class Svc(object):

    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):

-       f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
-
-       f0, uv = f0_predictor_object.compute_f0_uv(wav)
+       if not hasattr(self,"f0_predictor_object") or self.f0_predictor_object is None or f0_predictor != self.f0_predictor_object.name:
+           self.f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
+       f0, uv = self.f0_predictor_object.compute_f0_uv(wav)
+
        if f0_filter and sum(f0) == 0:
            raise F0FilterException("No voice detected")
        f0 = torch.FloatTensor(f0).to(self.dev)
@@ -214,21 +216,24 @@ class Svc(object):
        f0 = f0.unsqueeze(0)
        uv = uv.unsqueeze(0)

-       wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
-       wav16k = torch.from_numpy(wav16k).to(self.dev)
+       wav = torch.from_numpy(wav).to(self.dev)
+       if not hasattr(self,"audio16k_resample_transform"):
+           self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
+       wav16k = self.audio16k_resample_transform(wav[None,:])[0]
+
        c = self.hubert_model.encoder(wav16k)
        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)

        if cluster_infer_ratio !=0:
            if self.feature_retrieval:
                speaker_id = self.spk2id.get(speaker)
-               if speaker_id is None:
-                   raise RuntimeError("The name you entered is not in the speaker list!")
                if not speaker_id and type(speaker) is int:
                    if len(self.spk2id.__dict__) >= speaker:
                        speaker_id = speaker
+               if speaker_id is None:
+                   raise RuntimeError("The name you entered is not in the speaker list!")
                feature_index = self.cluster_model[speaker_id]
-               feat_np = c.transpose(0,1).cpu().numpy()
+               feat_np = np.ascontiguousarray(c.transpose(0,1).cpu().numpy())
                if self.big_npy is None or self.now_spk_id != speaker_id:
                    self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
                    self.now_spk_id = speaker_id
@@ -247,7 +252,7 @@ class Svc(object):

        c = c.unsqueeze(0)
        return c, f0, uv
-
+
    def infer(self, speaker, tran, raw_path,
              cluster_infer_ratio=0,
              auto_predict_f0=False,
@@ -262,7 +267,10 @@ class Svc(object):
              second_encoding = False,
              loudness_envelope_adjustment = 1
              ):
-       wav, sr = librosa.load(raw_path, sr=self.target_sample)
+       wav, sr = torchaudio.load(raw_path)
+       if not hasattr(self,"audio_resample_transform") or self.audio16k_resample_transform.orig_freq != sr:
+           self.audio_resample_transform = torchaudio.transforms.Resample(sr,self.target_sample)
+       wav = self.audio_resample_transform(wav).numpy()[0]
        if spk_mix:
            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
            n_frames = f0.size(1)
@@ -298,8 +306,9 @@ class Svc(object):
        if self.only_diffusion or self.shallow_diffusion:
            vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
            if self.shallow_diffusion and second_encoding:
-               audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
-               audio16k = torch.from_numpy(audio16k).to(self.dev)
+               if not hasattr(self,"audio16k_resample_transform"):
+                   self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
+               audio16k = self.audio16k_resample_transform(audio[None,:])[0]
                c = self.hubert_model.encoder(audio16k)
                c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
                f0 = f0[:,:,None]
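Note: the hunks above swap the per-call librosa.load/librosa.resample path for torchaudio.transforms.Resample objects that are built once and cached on the Svc instance, so repeated inference reuses the same resampling kernels; the guard on audio_resample_transform tests self.audio16k_resample_transform.orig_freq, which looks like it was meant to read self.audio_resample_transform.orig_freq. A minimal sketch of the caching pattern (ResampleCache is a hypothetical helper for illustration, not part of the repo):

    # Sketch of the cached-resampler idea used above; assumes torchaudio is available.
    import torch
    import torchaudio

    class ResampleCache:
        def __init__(self, device="cpu"):
            self.device = device
            self._cache = {}  # (orig_sr, target_sr) -> torchaudio.transforms.Resample

        def __call__(self, wav: torch.Tensor, orig_sr: int, target_sr: int) -> torch.Tensor:
            key = (orig_sr, target_sr)
            if key not in self._cache:  # build the kernel once, reuse on later calls
                self._cache[key] = torchaudio.transforms.Resample(orig_sr, target_sr).to(self.device)
            return self._cache[key](wav.to(self.device))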
models.py CHANGED
@@ -20,7 +20,9 @@ class ResidualCouplingBlock(nn.Module):
                 dilation_rate,
                 n_layers,
                 n_flows=4,
-                gin_channels=0):
+                gin_channels=0,
+                share_parameter=False
+                ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
@@ -31,10 +33,13 @@ class ResidualCouplingBlock(nn.Module):
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
+
+       self.wn = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=gin_channels) if share_parameter else None
+
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-                                             gin_channels=gin_channels, mean_only=True))
+                                             gin_channels=gin_channels, mean_only=True, wn_sharing_parameter=self.wn))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
@@ -320,6 +325,7 @@ class SynthesizerTrn(nn.Module):
                 vocoder_name = "nsf-hifigan",
                 use_depthwise_conv = False,
                 use_automatic_f0_prediction = True,
+                flow_share_parameter = False,
                 n_flow_layer = 4,
                 **kwargs):

@@ -386,7 +392,7 @@ class SynthesizerTrn(nn.Module):
        self.dec = Generator(h=hps)

        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-       self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
+       self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels, share_parameter= flow_share_parameter)
        if self.use_automatic_f0_prediction:
            self.f0_decoder = F0Decoder(
                1,
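Note: when flow_share_parameter is enabled, a single modules.WN block is built once and handed to every ResidualCouplingLayer as wn_sharing_parameter, so the n_flows coupling layers reuse one set of WaveNet weights instead of each holding its own copy. A rough, self-contained sketch of that weight-sharing idea (toy module names, not the repo's classes):

    import torch.nn as nn

    class ToyCouplingStack(nn.Module):
        def __init__(self, hidden=192, n_flows=4, share_parameter=False):
            super().__init__()
            # built once and reused by every layer when sharing is on
            shared_core = nn.Linear(hidden, hidden) if share_parameter else None
            self.layers = nn.ModuleList([
                shared_core if shared_core is not None else nn.Linear(hidden, hidden)
                for _ in range(n_flows)
            ])

        def forward(self, x):
            for layer in self.layers:
                x = layer(x)
            return x

With sharing on, the stack's parameter count stays that of a single core; with it off, it grows with n_flows.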
modules/F0Predictor/CrepeF0Predictor.py CHANGED
@@ -13,6 +13,7 @@ class CrepeF0Predictor(F0Predictor):
        self.device = device
        self.threshold = threshold
        self.sampling_rate = sampling_rate
+       self.name = "crepe"

    def compute_f0(self,wav,p_len=None):
        x = torch.FloatTensor(wav).to(self.device)
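Note: the name attribute added to each predictor here and below ("crepe", "dio", "harvest", "pm", "rmvpe", "fcpe") is what the new caching logic in Svc.get_unit_f0 compares against, so the predictor object is only rebuilt when the requested predictor actually changes:

    # Mirrors the inference/infer_tool.py hunk above.
    if (not hasattr(self, "f0_predictor_object")
            or self.f0_predictor_object is None
            or f0_predictor != self.f0_predictor_object.name):
        self.f0_predictor_object = utils.get_f0_predictor(
            f0_predictor, hop_length=self.hop_size, sampling_rate=self.target_sample,
            device=self.dev, threshold=cr_threshold)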
modules/F0Predictor/DioF0Predictor.py CHANGED
@@ -10,6 +10,7 @@ class DioF0Predictor(F0Predictor):
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate
+       self.name = "dio"

    def interpolate_f0(self,f0):
        '''
modules/F0Predictor/FCPEF0Predictor.py ADDED
@@ -0,0 +1,109 @@
+from typing import Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from modules.F0Predictor.F0Predictor import F0Predictor
+
+from .fcpe.model import FCPEInfer
+
+
+class FCPEF0Predictor(F0Predictor):
+    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, dtype=torch.float32, device=None, sampling_rate=44100,
+                 threshold=0.05):
+        self.fcpe = FCPEInfer(model_path="pretrain/fcpe.pt", device=device, dtype=dtype)
+        self.hop_length = hop_length
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        if device is None:
+            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        else:
+            self.device = device
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+        self.dtype = dtype
+        self.name = "fcpe"
+
+    def repeat_expand(
+        self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest"
+    ):
+        ndim = content.ndim
+
+        if content.ndim == 1:
+            content = content[None, None]
+        elif content.ndim == 2:
+            content = content[None]
+
+        assert content.ndim == 3
+
+        is_np = isinstance(content, np.ndarray)
+        if is_np:
+            content = torch.from_numpy(content)
+
+        results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
+
+        if is_np:
+            results = results.numpy()
+
+        if ndim == 1:
+            return results[0, 0]
+        elif ndim == 2:
+            return results[0]
+
+    def post_process(self, x, sampling_rate, f0, pad_to):
+        if isinstance(f0, np.ndarray):
+            f0 = torch.from_numpy(f0).float().to(x.device)
+
+        if pad_to is None:
+            return f0
+
+        f0 = self.repeat_expand(f0, pad_to)
+
+        vuv_vector = torch.zeros_like(f0)
+        vuv_vector[f0 > 0.0] = 1.0
+        vuv_vector[f0 <= 0.0] = 0.0
+
+        # drop zero-frequency frames, then interpolate linearly
+        nzindex = torch.nonzero(f0).squeeze()
+        f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
+        time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
+        time_frame = np.arange(pad_to) * self.hop_length / sampling_rate
+
+        vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
+
+        if f0.shape[0] <= 0:
+            return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), vuv_vector.cpu().numpy()
+        if f0.shape[0] == 1:
+            return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[
+                0]).cpu().numpy(), vuv_vector.cpu().numpy()
+
+        # this could probably be rewritten with torch
+        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
+        # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))
+
+        return f0, vuv_vector.cpu().numpy()
+
+    def compute_f0(self, wav, p_len=None):
+        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
+        if p_len is None:
+            p_len = x.shape[0] // self.hop_length
+        else:
+            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
+        f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0]
+        if torch.all(f0 == 0):
+            rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
+            return rtn, rtn
+        return self.post_process(x, self.sampling_rate, f0, p_len)[0]
+
+    def compute_f0_uv(self, wav, p_len=None):
+        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
+        if p_len is None:
+            p_len = x.shape[0] // self.hop_length
+        else:
+            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
+        f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0,:,0]
+        if torch.all(f0 == 0):
+            rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
+            return rtn, rtn
+        return self.post_process(x, self.sampling_rate, f0, p_len)
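Note: the new FCPE predictor loads its pretrained weights from pretrain/fcpe.pt in __init__ and plugs into the same interface as the other predictors. A hedged usage sketch (the file path and rates are illustrative, not requirements of the repo):

    import librosa
    import torch
    from modules.F0Predictor.FCPEF0Predictor import FCPEF0Predictor

    wav, sr = librosa.load("example.wav", sr=44100)    # any mono clip, resampled to 44.1 kHz here
    predictor = FCPEF0Predictor(hop_length=512, sampling_rate=sr,
                                dtype=torch.float32, device="cpu", threshold=0.05)
    f0, uv = predictor.compute_f0_uv(wav)              # frame-level f0 plus voiced/unvoiced mask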
modules/F0Predictor/HarvestF0Predictor.py CHANGED
@@ -10,6 +10,7 @@ class HarvestF0Predictor(F0Predictor):
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate
+       self.name = "harvest"

    def interpolate_f0(self,f0):
        '''
modules/F0Predictor/PMF0Predictor.py CHANGED
@@ -10,7 +10,7 @@ class PMF0Predictor(F0Predictor):
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate
-
+       self.name = "pm"

    def interpolate_f0(self,f0):
        '''
modules/F0Predictor/RMVPEF0Predictor.py CHANGED
@@ -16,13 +16,13 @@ class RMVPEF0Predictor(F0Predictor):
        self.f0_min = f0_min
        self.f0_max = f0_max
        if device is None:
-           self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-           #self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+           self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
-           self.dev = torch.device("cpu")
+           self.device = device
        self.threshold = threshold
        self.sampling_rate = sampling_rate
        self.dtype = dtype
+       self.name = "rmvpe"

    def repeat_expand(
        self, content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest"
@@ -72,9 +72,9 @@ class RMVPEF0Predictor(F0Predictor):
        vuv_vector = F.interpolate(vuv_vector[None,None,:],size=pad_to)[0][0]

        if f0.shape[0] <= 0:
-           return torch.zeros(pad_to, dtype=torch.float, device=x.device),vuv_vector.cpu().numpy()
+           return torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(),vuv_vector.cpu().numpy()
        if f0.shape[0] == 1:
-           return torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0],vuv_vector.cpu().numpy()
+           return (torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0]).cpu().numpy() ,vuv_vector.cpu().numpy()

        # this could probably be rewritten with torch
        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
@@ -104,4 +104,4 @@ class RMVPEF0Predictor(F0Predictor):
        if torch.all(f0 == 0):
            rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
            return rtn,rtn
-       return self.post_process(x,self.sampling_rate,f0,p_len)
+       return self.post_process(x,self.sampling_rate,f0,p_len)
modules/F0Predictor/fcpe/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .model import FCPEInfer  # noqa: F401
+from .nvSTFT import STFT  # noqa: F401
+from .pcmer import PCmer  # noqa: F401
modules/F0Predictor/fcpe/model.py ADDED
@@ -0,0 +1,237 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+from torchaudio.transforms import Resample
+
+from .nvSTFT import STFT
+from .pcmer import PCmer
+
+
+def l2_regularization(model, l2_alpha):
+    l2_loss = []
+    for module in model.modules():
+        if type(module) is nn.Conv2d:
+            l2_loss.append((module.weight ** 2).sum() / 2.0)
+    return l2_alpha * sum(l2_loss)
+
+
+class FCPE(nn.Module):
+    def __init__(
+            self,
+            input_channel=128,
+            out_dims=360,
+            n_layers=12,
+            n_chans=512,
+            use_siren=False,
+            use_full=False,
+            loss_mse_scale=10,
+            loss_l2_regularization=False,
+            loss_l2_regularization_scale=1,
+            loss_grad1_mse=False,
+            loss_grad1_mse_scale=1,
+            f0_max=1975.5,
+            f0_min=32.70,
+            confidence=False,
+            threshold=0.05,
+            use_input_conv=True
+    ):
+        super().__init__()
+        if use_siren is True:
+            raise ValueError("Siren is not supported yet.")
+        if use_full is True:
+            raise ValueError("Full model is not supported yet.")
+
+        self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
+        self.loss_l2_regularization = loss_l2_regularization if (loss_l2_regularization is not None) else False
+        self.loss_l2_regularization_scale = loss_l2_regularization_scale if (loss_l2_regularization_scale
+                                                                             is not None) else 1
+        self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
+        self.loss_grad1_mse_scale = loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
+        self.f0_max = f0_max if (f0_max is not None) else 1975.5
+        self.f0_min = f0_min if (f0_min is not None) else 32.70
+        self.confidence = confidence if (confidence is not None) else False
+        self.threshold = threshold if (threshold is not None) else 0.05
+        self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
+
+        self.cent_table_b = torch.Tensor(
+            np.linspace(self.f0_to_cent(torch.Tensor([f0_min]))[0], self.f0_to_cent(torch.Tensor([f0_max]))[0],
+                        out_dims))
+        self.register_buffer("cent_table", self.cent_table_b)
+
+        # conv in stack
+        _leaky = nn.LeakyReLU()
+        self.stack = nn.Sequential(
+            nn.Conv1d(input_channel, n_chans, 3, 1, 1),
+            nn.GroupNorm(4, n_chans),
+            _leaky,
+            nn.Conv1d(n_chans, n_chans, 3, 1, 1))
+
+        # transformer
+        self.decoder = PCmer(
+            num_layers=n_layers,
+            num_heads=8,
+            dim_model=n_chans,
+            dim_keys=n_chans,
+            dim_values=n_chans,
+            residual_dropout=0.1,
+            attention_dropout=0.1)
+        self.norm = nn.LayerNorm(n_chans)
+
+        # out
+        self.n_out = out_dims
+        self.dense_out = weight_norm(
+            nn.Linear(n_chans, self.n_out))
+
+    def forward(self, mel, infer=True, gt_f0=None, return_hz_f0=False):
+        """
+        input:
+            B x n_frames x n_unit
+        return:
+            dict of B x n_frames x feat
+        """
+        if self.use_input_conv:
+            x = self.stack(mel.transpose(1, 2)).transpose(1, 2)
+        else:
+            x = mel
+        x = self.decoder(x)
+        x = self.norm(x)
+        x = self.dense_out(x)  # [B,N,D]
+        x = torch.sigmoid(x)
+        if not infer:
+            gt_cent_f0 = self.f0_to_cent(gt_f0)  # mel f0 #[B,N,1]
+            gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)  # #[B,N,out_dim]
+            loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)  # bce loss
+            # l2 regularization
+            if self.loss_l2_regularization:
+                loss_all = loss_all + l2_regularization(model=self, l2_alpha=self.loss_l2_regularization_scale)
+            x = loss_all
+        if infer:
+            x = self.cents_decoder(x)
+            x = self.cent_to_f0(x)
+            if not return_hz_f0:
+                x = (1 + x / 700).log()
+        return x
+
+    def cents_decoder(self, y, mask=True):
+        B, N, _ = y.size()
+        ci = self.cent_table[None, None, :].expand(B, N, -1)
+        rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(y, dim=-1, keepdim=True)  # cents: [B,N,1]
+        if mask:
+            confident = torch.max(y, dim=-1, keepdim=True)[0]
+            confident_mask = torch.ones_like(confident)
+            confident_mask[confident <= self.threshold] = float("-INF")
+            rtn = rtn * confident_mask
+        if self.confidence:
+            return rtn, confident
+        else:
+            return rtn
+
+    def cent_to_f0(self, cent):
+        return 10. * 2 ** (cent / 1200.)
+
+    def f0_to_cent(self, f0):
+        return 1200. * torch.log2(f0 / 10.)
+
+    def gaussian_blurred_cent(self, cents):  # cents: [B,N,1]
+        mask = (cents > 0.1) & (cents < (1200. * np.log2(self.f0_max / 10.)))
+        B, N, _ = cents.size()
+        ci = self.cent_table[None, None, :].expand(B, N, -1)
+        return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
+
+
+class FCPEInfer:
+    def __init__(self, model_path, device=None, dtype=torch.float32):
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = device
+        ckpt = torch.load(model_path, map_location=torch.device(self.device))
+        self.args = DotDict(ckpt["config"])
+        self.dtype = dtype
+        model = FCPE(
+            input_channel=self.args.model.input_channel,
+            out_dims=self.args.model.out_dims,
+            n_layers=self.args.model.n_layers,
+            n_chans=self.args.model.n_chans,
+            use_siren=self.args.model.use_siren,
+            use_full=self.args.model.use_full,
+            loss_mse_scale=self.args.loss.loss_mse_scale,
+            loss_l2_regularization=self.args.loss.loss_l2_regularization,
+            loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
+            loss_grad1_mse=self.args.loss.loss_grad1_mse,
+            loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
+            f0_max=self.args.model.f0_max,
+            f0_min=self.args.model.f0_min,
+            confidence=self.args.model.confidence,
+        )
+        ckpt = torch.load(model_path, map_location=torch.device(self.device))
+        model.to(self.device).to(self.dtype)
+        model.load_state_dict(ckpt['model'])
+        model.eval()
+        self.model = model
+        self.wav2mel = Wav2Mel(self.args)
+
+    @torch.no_grad()
+    def __call__(self, audio, sr, threshold=0.05):
+        self.model.threshold = threshold
+        audio = audio[None,:]
+        mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
+        f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
+        return f0
+
+
+class Wav2Mel:
+    def __init__(self, args, device=None):
+        # self.args = args
+        self.sampling_rate = args.mel.sampling_rate
+        self.hop_size = args.mel.hop_size
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = device
+        self.stft = STFT(
+            args.mel.sampling_rate,
+            args.mel.num_mels,
+            args.mel.n_fft,
+            args.mel.win_size,
+            args.mel.hop_size,
+            args.mel.fmin,
+            args.mel.fmax
+        )
+        self.resample_kernel = {}
+
+    def extract_nvstft(self, audio, keyshift=0, train=False):
+        mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)  # B, n_frames, bins
+        return mel
+
+    def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
+        # resample
+        if sample_rate == self.sampling_rate:
+            audio_res = audio
+        else:
+            key_str = str(sample_rate)
+            if key_str not in self.resample_kernel:
+                self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate,
+                                                         lowpass_filter_width=128).to(self.device)
+            audio_res = self.resample_kernel[key_str](audio)
+
+        # extract
+        mel = self.extract_nvstft(audio_res, keyshift=keyshift, train=train)  # B, n_frames, bins
+        n_frames = int(audio.shape[1] // self.hop_size) + 1
+        if n_frames > int(mel.shape[1]):
+            mel = torch.cat((mel, mel[:, -1:, :]), 1)
+        if n_frames < int(mel.shape[1]):
+            mel = mel[:, :n_frames, :]
+        return mel
+
+    def __call__(self, audio, sample_rate, keyshift=0, train=False):
+        return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
+
+
+class DotDict(dict):
+    def __getattr__(*args):
+        val = dict.get(*args)
+        return DotDict(val) if type(val) is dict else val
+
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
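Note: FCPE works on a cent scale relative to 10 Hz, so f0_to_cent and cent_to_f0 above are exact inverses: cent(f0) = 1200 · log2(f0 / 10) and f0(cent) = 10 · 2^(cent / 1200). For example, 440 Hz maps to 1200 · log2(44) ≈ 6551 cents, and 10 · 2^(6551/1200) ≈ 440 Hz. cents_decoder returns a confidence-weighted average over the cent table (masked below the threshold), which cent_to_f0 then maps back to Hz.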
modules/F0Predictor/fcpe/nvSTFT.py ADDED
@@ -0,0 +1,133 @@
+import os
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+import torch.nn.functional as F
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+os.environ["LRU_CACHE_CAPACITY"] = "3"
+
+def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
+    sampling_rate = None
+    try:
+        data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile.
+    except Exception as ex:
+        print(f"'{full_path}' failed to load.\nException:")
+        print(ex)
+        if return_empty_on_exception:
+            return [], sampling_rate or target_sr or 48000
+        else:
+            raise Exception(ex)
+
+    if len(data.shape) > 1:
+        data = data[:, 0]
+        assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
+
+    if np.issubdtype(data.dtype, np.integer): # if audio data is type int
+        max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX
+    else: # if audio data is type fp32
+        max_mag = max(np.amax(data), -np.amin(data))
+        max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
+
+    data = torch.FloatTensor(data.astype(np.float32))/max_mag
+
+    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
+        return [], sampling_rate or target_sr or 48000
+    if target_sr is not None and sampling_rate != target_sr:
+        data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
+        sampling_rate = target_sr
+
+    return data, sampling_rate
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+def dynamic_range_decompression(x, C=1):
+    return np.exp(x) / C
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+def dynamic_range_decompression_torch(x, C=1):
+    return torch.exp(x) / C
+
+class STFT():
+    def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
+        self.target_sr = sr
+
+        self.n_mels = n_mels
+        self.n_fft = n_fft
+        self.win_size = win_size
+        self.hop_length = hop_length
+        self.fmin = fmin
+        self.fmax = fmax
+        self.clip_val = clip_val
+        self.mel_basis = {}
+        self.hann_window = {}
+
+    def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
+        sampling_rate = self.target_sr
+        n_mels = self.n_mels
+        n_fft = self.n_fft
+        win_size = self.win_size
+        hop_length = self.hop_length
+        fmin = self.fmin
+        fmax = self.fmax
+        clip_val = self.clip_val
+
+        factor = 2 ** (keyshift / 12)
+        n_fft_new = int(np.round(n_fft * factor))
+        win_size_new = int(np.round(win_size * factor))
+        hop_length_new = int(np.round(hop_length * speed))
+        if not train:
+            mel_basis = self.mel_basis
+            hann_window = self.hann_window
+        else:
+            mel_basis = {}
+            hann_window = {}
+
+        if torch.min(y) < -1.:
+            print('min value is ', torch.min(y))
+        if torch.max(y) > 1.:
+            print('max value is ', torch.max(y))
+
+        mel_basis_key = str(fmax)+'_'+str(y.device)
+        if mel_basis_key not in mel_basis:
+            mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+            mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
+
+        keyshift_key = str(keyshift)+'_'+str(y.device)
+        if keyshift_key not in hann_window:
+            hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
+
+        pad_left = (win_size_new - hop_length_new) //2
+        pad_right = max((win_size_new- hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left)
+        if pad_right < y.size(-1):
+            mode = 'reflect'
+        else:
+            mode = 'constant'
+        y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode)
+        y = y.squeeze(1)
+
+        spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key],
+                          center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
+        spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
+        if keyshift != 0:
+            size = n_fft // 2 + 1
+            resize = spec.size(1)
+            if resize < size:
+                spec = F.pad(spec, (0, 0, 0, size-resize))
+            spec = spec[:, :size, :] * win_size / win_size_new
+        spec = torch.matmul(mel_basis[mel_basis_key], spec)
+        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
+        return spec
+
+    def __call__(self, audiopath):
+        audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
+        spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
+        return spect
+
+stft = STFT()
modules/F0Predictor/fcpe/pcmer.py ADDED
@@ -0,0 +1,369 @@
1
+ import math
2
+ from functools import partial
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from einops import rearrange, repeat
7
+ from local_attention import LocalAttention
8
+ from torch import nn
9
+
10
+ #import fast_transformers.causal_product.causal_product_cuda
11
+
12
+ def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device = None):
13
+ b, h, *_ = data.shape
14
+ # (batch size, head, length, model_dim)
15
+
16
+ # normalize model dim
17
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.
18
+
19
+ # what is ration?, projection_matrix.shape[0] --> 266
20
+
21
+ ratio = (projection_matrix.shape[0] ** -0.5)
22
+
23
+ projection = repeat(projection_matrix, 'j d -> b h j d', b = b, h = h)
24
+ projection = projection.type_as(data)
25
+
26
+ #data_dash = w^T x
27
+ data_dash = torch.einsum('...id,...jd->...ij', (data_normalizer * data), projection)
28
+
29
+
30
+ # diag_data = D**2
31
+ diag_data = data ** 2
32
+ diag_data = torch.sum(diag_data, dim=-1)
33
+ diag_data = (diag_data / 2.0) * (data_normalizer ** 2)
34
+ diag_data = diag_data.unsqueeze(dim=-1)
35
+
36
+ #print ()
37
+ if is_query:
38
+ data_dash = ratio * (
39
+ torch.exp(data_dash - diag_data -
40
+ torch.max(data_dash, dim=-1, keepdim=True).values) + eps)
41
+ else:
42
+ data_dash = ratio * (
43
+ torch.exp(data_dash - diag_data + eps))#- torch.max(data_dash)) + eps)
44
+
45
+ return data_dash.type_as(data)
46
+
47
+ def orthogonal_matrix_chunk(cols, qr_uniform_q = False, device = None):
48
+ unstructured_block = torch.randn((cols, cols), device = device)
49
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode='reduced')
50
+ q, r = map(lambda t: t.to(device), (q, r))
51
+
52
+ # proposed by @Parskatt
53
+ # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
54
+ if qr_uniform_q:
55
+ d = torch.diag(r, 0)
56
+ q *= d.sign()
57
+ return q.t()
58
+ def exists(val):
59
+ return val is not None
60
+
61
+ def empty(tensor):
62
+ return tensor.numel() == 0
63
+
64
+ def default(val, d):
65
+ return val if exists(val) else d
66
+
67
+ def cast_tuple(val):
68
+ return (val,) if not isinstance(val, tuple) else val
69
+
70
+ class PCmer(nn.Module):
71
+ """The encoder that is used in the Transformer model."""
72
+
73
+ def __init__(self,
74
+ num_layers,
75
+ num_heads,
76
+ dim_model,
77
+ dim_keys,
78
+ dim_values,
79
+ residual_dropout,
80
+ attention_dropout):
81
+ super().__init__()
82
+ self.num_layers = num_layers
83
+ self.num_heads = num_heads
84
+ self.dim_model = dim_model
85
+ self.dim_values = dim_values
86
+ self.dim_keys = dim_keys
87
+ self.residual_dropout = residual_dropout
88
+ self.attention_dropout = attention_dropout
89
+
90
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
91
+
92
+ # METHODS ########################################################################################################
93
+
94
+ def forward(self, phone, mask=None):
95
+
96
+ # apply all layers to the input
97
+ for (i, layer) in enumerate(self._layers):
98
+ phone = layer(phone, mask)
99
+ # provide the final sequence
100
+ return phone
101
+
102
+
103
+ # ==================================================================================================================== #
104
+ # CLASS _ E N C O D E R L A Y E R #
105
+ # ==================================================================================================================== #
106
+
107
+
108
+ class _EncoderLayer(nn.Module):
109
+ """One layer of the encoder.
110
+
111
+ Attributes:
112
+ attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence.
113
+ feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism.
114
+ """
115
+
116
+ def __init__(self, parent: PCmer):
117
+ """Creates a new instance of ``_EncoderLayer``.
118
+
119
+ Args:
120
+ parent (Encoder): The encoder that the layers is created for.
121
+ """
122
+ super().__init__()
123
+
124
+
125
+ self.conformer = ConformerConvModule(parent.dim_model)
126
+ self.norm = nn.LayerNorm(parent.dim_model)
127
+ self.dropout = nn.Dropout(parent.residual_dropout)
128
+
129
+ # selfatt -> fastatt: performer!
130
+ self.attn = SelfAttention(dim = parent.dim_model,
131
+ heads = parent.num_heads,
132
+ causal = False)
133
+
134
+ # METHODS ########################################################################################################
135
+
136
+ def forward(self, phone, mask=None):
137
+
138
+ # compute attention sub-layer
139
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
140
+
141
+ phone = phone + (self.conformer(phone))
142
+
143
+ return phone
144
+
145
+ def calc_same_padding(kernel_size):
146
+ pad = kernel_size // 2
147
+ return (pad, pad - (kernel_size + 1) % 2)
148
+
149
+ # helper classes
150
+
151
+ class Swish(nn.Module):
152
+ def forward(self, x):
153
+ return x * x.sigmoid()
154
+
155
+ class Transpose(nn.Module):
156
+ def __init__(self, dims):
157
+ super().__init__()
158
+ assert len(dims) == 2, 'dims must be a tuple of two dimensions'
159
+ self.dims = dims
160
+
161
+ def forward(self, x):
162
+ return x.transpose(*self.dims)
163
+
164
+ class GLU(nn.Module):
165
+ def __init__(self, dim):
166
+ super().__init__()
167
+ self.dim = dim
168
+
169
+ def forward(self, x):
170
+ out, gate = x.chunk(2, dim=self.dim)
171
+ return out * gate.sigmoid()
172
+
173
+ class DepthWiseConv1d(nn.Module):
174
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
175
+ super().__init__()
176
+ self.padding = padding
177
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in)
178
+
179
+ def forward(self, x):
180
+ x = F.pad(x, self.padding)
181
+ return self.conv(x)
182
+
183
+ class ConformerConvModule(nn.Module):
184
+ def __init__(
185
+ self,
186
+ dim,
187
+ causal = False,
188
+ expansion_factor = 2,
189
+ kernel_size = 31,
190
+ dropout = 0.):
191
+ super().__init__()
192
+
193
+ inner_dim = dim * expansion_factor
194
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
195
+
196
+ self.net = nn.Sequential(
197
+ nn.LayerNorm(dim),
198
+ Transpose((1, 2)),
199
+ nn.Conv1d(dim, inner_dim * 2, 1),
200
+ GLU(dim=1),
201
+ DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding),
202
+ #nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
203
+ Swish(),
204
+ nn.Conv1d(inner_dim, dim, 1),
205
+ Transpose((1, 2)),
206
+ nn.Dropout(dropout)
207
+ )
208
+
209
+ def forward(self, x):
210
+ return self.net(x)
211
+
212
+ def linear_attention(q, k, v):
213
+ if v is None:
214
+ #print (k.size(), q.size())
215
+ out = torch.einsum('...ed,...nd->...ne', k, q)
216
+ return out
217
+
218
+ else:
219
+ k_cumsum = k.sum(dim = -2)
220
+ #k_cumsum = k.sum(dim = -2)
221
+ D_inv = 1. / (torch.einsum('...nd,...d->...n', q, k_cumsum.type_as(q)) + 1e-8)
222
+
223
+ context = torch.einsum('...nd,...ne->...de', k, v)
224
+ #print ("TRUEEE: ", context.size(), q.size(), D_inv.size())
225
+ out = torch.einsum('...de,...nd,...n->...ne', context, q, D_inv)
226
+ return out
227
+
228
+ def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling = 0, qr_uniform_q = False, device = None):
229
+ nb_full_blocks = int(nb_rows / nb_columns)
230
+ #print (nb_full_blocks)
231
+ block_list = []
232
+
233
+ for _ in range(nb_full_blocks):
234
+ q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device)
235
+ block_list.append(q)
236
+ # block_list[n] is a orthogonal matrix ... (model_dim * model_dim)
237
+ #print (block_list[0].size(), torch.einsum('...nd,...nd->...n', block_list[0], torch.roll(block_list[0],1,1)))
238
+ #print (nb_rows, nb_full_blocks, nb_columns)
239
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
240
+ #print (remaining_rows)
241
+ if remaining_rows > 0:
242
+ q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device)
243
+ #print (q[:remaining_rows].size())
244
+ block_list.append(q[:remaining_rows])
245
+
246
+ final_matrix = torch.cat(block_list)
247
+
248
+ if scaling == 0:
249
+ multiplier = torch.randn((nb_rows, nb_columns), device = device).norm(dim = 1)
250
+ elif scaling == 1:
251
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device = device)
252
+ else:
253
+ raise ValueError(f'Invalid scaling {scaling}')
254
+
255
+ return torch.diag(multiplier) @ final_matrix
256
+
257
+ class FastAttention(nn.Module):
258
+ def __init__(self, dim_heads, nb_features = None, ortho_scaling = 0, causal = False, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, no_projection = False):
259
+ super().__init__()
260
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
261
+
262
+ self.dim_heads = dim_heads
263
+ self.nb_features = nb_features
264
+ self.ortho_scaling = ortho_scaling
265
+
266
+ self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows = self.nb_features, nb_columns = dim_heads, scaling = ortho_scaling, qr_uniform_q = qr_uniform_q)
267
+ projection_matrix = self.create_projection()
268
+ self.register_buffer('projection_matrix', projection_matrix)
269
+
270
+ self.generalized_attention = generalized_attention
271
+ self.kernel_fn = kernel_fn
272
+
273
+ # if this is turned on, no projection will be used
274
+ # queries and keys will be softmax-ed as in the original efficient attention paper
275
+ self.no_projection = no_projection
276
+
277
+ self.causal = causal
278
+
279
+ @torch.no_grad()
280
+ def redraw_projection_matrix(self):
281
+ projections = self.create_projection()
282
+ self.projection_matrix.copy_(projections)
283
+ del projections
284
+
285
+ def forward(self, q, k, v):
286
+ device = q.device
287
+
288
+ if self.no_projection:
289
+ q = q.softmax(dim = -1)
290
+ k = torch.exp(k) if self.causal else k.softmax(dim = -2)
291
+ else:
292
+ create_kernel = partial(softmax_kernel, projection_matrix = self.projection_matrix, device = device)
293
+
294
+ q = create_kernel(q, is_query = True)
295
+ k = create_kernel(k, is_query = False)
296
+
297
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
298
+ if v is None:
299
+ out = attn_fn(q, k, None)
300
+ return out
301
+ else:
302
+ out = attn_fn(q, k, v)
303
+ return out
304
+ class SelfAttention(nn.Module):
305
+ def __init__(self, dim, causal = False, heads = 8, dim_head = 64, local_heads = 0, local_window_size = 256, nb_features = None, feature_redraw_interval = 1000, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, dropout = 0., no_projection = False):
306
+ super().__init__()
307
+ assert dim % heads == 0, 'dimension must be divisible by number of heads'
308
+ dim_head = default(dim_head, dim // heads)
309
+ inner_dim = dim_head * heads
310
+ self.fast_attention = FastAttention(dim_head, nb_features, causal = causal, generalized_attention = generalized_attention, kernel_fn = kernel_fn, qr_uniform_q = qr_uniform_q, no_projection = no_projection)
311
+
312
+ self.heads = heads
313
+ self.global_heads = heads - local_heads
314
+ self.local_attn = LocalAttention(window_size = local_window_size, causal = causal, autopad = True, dropout = dropout, look_forward = int(not causal), rel_pos_emb_config = (dim_head, local_heads)) if local_heads > 0 else None
315
+
316
+ #print (heads, nb_features, dim_head)
317
+ #name_embedding = torch.zeros(110, heads, dim_head, dim_head)
318
+ #self.name_embedding = nn.Parameter(name_embedding, requires_grad=True)
319
+
320
+
321
+ self.to_q = nn.Linear(dim, inner_dim)
322
+ self.to_k = nn.Linear(dim, inner_dim)
323
+ self.to_v = nn.Linear(dim, inner_dim)
324
+ self.to_out = nn.Linear(inner_dim, dim)
325
+ self.dropout = nn.Dropout(dropout)
326
+
327
+ @torch.no_grad()
328
+ def redraw_projection_matrix(self):
329
+ self.fast_attention.redraw_projection_matrix()
330
+ #torch.nn.init.zeros_(self.name_embedding)
331
+ #print (torch.sum(self.name_embedding))
332
+ def forward(self, x, context = None, mask = None, context_mask = None, name=None, inference=False, **kwargs):
333
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
334
+
335
+ cross_attend = exists(context)
336
+
337
+ context = default(context, x)
338
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
339
+ #print (torch.sum(self.name_embedding))
340
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
341
+
342
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
343
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
344
+
345
+ attn_outs = []
346
+ #print (name)
347
+ #print (self.name_embedding[name].size())
348
+ if not empty(q):
349
+ if exists(context_mask):
350
+ global_mask = context_mask[:, None, :, None]
351
+ v.masked_fill_(~global_mask, 0.)
352
+ if cross_attend:
353
+ pass
354
+ #print (torch.sum(self.name_embedding))
355
+ #out = self.fast_attention(q,self.name_embedding[name],None)
356
+ #print (torch.sum(self.name_embedding[...,-1:]))
357
+ else:
358
+ out = self.fast_attention(q, k, v)
359
+ attn_outs.append(out)
360
+
361
+ if not empty(lq):
362
+ assert not cross_attend, 'local attention is not compatible with cross attention'
363
+ out = self.local_attn(lq, lk, lv, input_mask = mask)
364
+ attn_outs.append(out)
365
+
366
+ out = torch.cat(attn_outs, dim = 1)
367
+ out = rearrange(out, 'b h n d -> b n (h d)')
368
+ out = self.to_out(out)
369
+ return self.dropout(out)
modules/F0Predictor/rmvpe/inference.py CHANGED
@@ -16,7 +16,7 @@ class RMVPE:
        else:
            self.device = device
        model = E2E0(4, 1, (2, 2))
-       ckpt = torch.load(model_path)
+       ckpt = torch.load(model_path, map_location=torch.device(self.device))
        model.load_state_dict(ckpt['model'])
        model = model.to(dtype).to(self.device)
        model.eval()
@@ -54,4 +54,4 @@ class RMVPE:
        mel = mel_extractor(audio_res, center=True).to(self.dtype)
        hidden = self.mel2hidden(mel)
        f0 = self.decode(hidden.squeeze(0), thred=thred, use_viterbi=use_viterbi)
-       return f0
+       return f0
utils.py CHANGED
@@ -43,7 +43,6 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
    if torch.isnan(f0_norm).any():
        exit(0)
    return f0_norm * x_mask
-
def plot_data_to_numpy(x, y):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
@@ -68,14 +67,16 @@ def plot_data_to_numpy(x, y):


def f0_to_coarse(f0):
-   is_torch = isinstance(f0, torch.Tensor)
-   f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
-   f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
-
-   f0_mel[f0_mel <= 1] = 1
-   f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-   f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
-   assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
+   f0_mel = 1127 * (1 + f0 / 700).log()
+   a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
+   b = f0_mel_min * a - 1.
+   f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
+   # torch.clip_(f0_mel, min=1., max=float(f0_bin - 1))
+   f0_coarse = torch.round(f0_mel).long()
+   f0_coarse = f0_coarse * (f0_coarse > 0)
+   f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
+   f0_coarse = f0_coarse * (f0_coarse < f0_bin)
+   f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))
    return f0_coarse

def get_content(cmodel, y):
@@ -100,6 +101,9 @@ def get_f0_predictor(f0_predictor,hop_length,sampling_rate,**kargs):
    elif f0_predictor == "rmvpe":
        from modules.F0Predictor.RMVPEF0Predictor import RMVPEF0Predictor
        f0_predictor_object = RMVPEF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"])
+   elif f0_predictor == "fcpe":
+       from modules.F0Predictor.FCPEF0Predictor import FCPEF0Predictor
+       f0_predictor_object = FCPEF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"])
    else:
        raise Exception("Unknown f0 predictor")
    return f0_predictor_object
@@ -170,7 +174,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
            assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape)
        except Exception:
            if "enc_q" not in k or "emb_g" not in k:
-               print("error, %s is not in the checkpoint" % k)
+               print("%s is not in the checkpoint,please check your checkpoint.If you're using pretrain model,just ignore this warning." % k)
                logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
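Note: the rewritten f0_to_coarse is torch-only; it maps f0 onto the mel scale, applies the affine map a·mel − b, and then clamps the rounded result into [1, f0_bin − 1] with arithmetic masks instead of in-place indexing and an assert. A quick sanity check, assuming the module constants keep their usual values (f0_bin = 256, f0_min = 50 Hz, f0_max = 1100 Hz):

    import torch
    from utils import f0_to_coarse

    f0 = torch.tensor([0.0, 50.0, 440.0, 1100.0])   # one unvoiced frame plus three voiced frames
    coarse = f0_to_coarse(f0)
    # unvoiced frames land on bucket 1, f0_max on bucket f0_bin - 1 = 255
    assert coarse.min() >= 1 and coarse.max() <= 255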
vdecoder/hifigan/models.py CHANGED
@@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
128
  self.sampling_rate = samp_rate
129
  self.voiced_threshold = voiced_threshold
130
  self.flag_for_pulse = flag_for_pulse
 
131
 
132
  def _f02uv(self, f0):
133
  # generate uv signal
@@ -193,35 +194,81 @@ class SineGen(torch.nn.Module):
193
  sines = torch.cos(i_phase * 2 * np.pi)
194
  return sines
195
 
196
- def forward(self, f0):
197
  """ sine_tensor, uv = forward(f0)
198
  input F0: tensor(batchsize=1, length, dim=1)
199
  f0 for unvoiced steps should be 0
200
  output sine_tensor: tensor(batchsize=1, length, dim)
201
  output uv: tensor(batchsize=1, length, 1)
202
  """
203
- with torch.no_grad():
204
- # fundamental component
205
- fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
- # generate sine waveforms
208
- sine_waves = self._f02sine(fn) * self.sine_amp
209
 
210
- # generate uv signal
211
- # uv = torch.ones(f0.shape)
212
- # uv = uv * (f0 > self.voiced_threshold)
213
- uv = self._f02uv(f0)
214
 
215
- # noise: for unvoiced should be similar to sine_amp
216
- # std = self.sine_amp/3 -> max value ~ self.sine_amp
217
- # . for voiced regions is self.noise_std
218
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
219
- noise = noise_amp * torch.randn_like(sine_waves)
220
 
221
- # first: set the unvoiced part to 0 by uv
222
- # then: additive noise
223
- sine_waves = sine_waves * uv + noise
224
- return sine_waves, uv, noise
225
 
226
 
227
  class SourceModuleHnNSF(torch.nn.Module):
@@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
257
  self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
258
  self.l_tanh = torch.nn.Tanh()
259
 
260
- def forward(self, x):
261
  """
262
  Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
263
  F0_sampled (batchsize, length, 1)
@@ -265,7 +312,7 @@ class SourceModuleHnNSF(torch.nn.Module):
265
  noise_source (batchsize, length 1)
266
  """
267
  # source for harmonic branch
268
- sine_wavs, uv, _ = self.l_sin_gen(x)
269
  sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
270
 
271
  # source for noise branch, in the same shape as uv
@@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
309
  self.ups.apply(init_weights)
310
  self.conv_post.apply(init_weights)
311
  self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
 
 
 
 
 
 
312
 
313
  def forward(self, x, f0, g=None):
314
  # print(1,x.shape,f0.shape,f0[:, None].shape)
315
- f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
 
316
  # print(2,f0.shape)
317
- har_source, noi_source, uv = self.m_source(f0)
318
  har_source = har_source.transpose(1, 2)
319
  x = self.conv_pre(x)
320
  x = x + self.cond(g)
 
128
  self.sampling_rate = samp_rate
129
  self.voiced_threshold = voiced_threshold
130
  self.flag_for_pulse = flag_for_pulse
131
+ self.onnx = False
132
 
133
  def _f02uv(self, f0):
134
  # generate uv signal
 
194
  sines = torch.cos(i_phase * 2 * np.pi)
195
  return sines
196
 
197
+ def forward(self, f0, upp=None):
198
  """ sine_tensor, uv = forward(f0)
199
  input F0: tensor(batchsize=1, length, dim=1)
200
  f0 for unvoiced steps should be 0
201
  output sine_tensor: tensor(batchsize=1, length, dim)
202
  output uv: tensor(batchsize=1, length, 1)
203
  """
204
+ if self.onnx:
205
+ with torch.no_grad():
206
+ f0 = f0[:, None].transpose(1, 2)
207
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
208
+ # fundamental component
209
+ f0_buf[:, :, 0] = f0[:, :, 0]
210
+ for idx in np.arange(self.harmonic_num):
211
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
212
+ idx + 2
213
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
214
+ rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
215
+ rand_ini = torch.rand(
216
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
217
+ )
218
+ rand_ini[:, 0] = 0
219
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
220
+ tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
221
+ tmp_over_one *= upp
222
+ tmp_over_one = F.interpolate(
223
+ tmp_over_one.transpose(2, 1),
224
+ scale_factor=upp,
225
+ mode="linear",
226
+ align_corners=True,
227
+ ).transpose(2, 1)
228
+ rad_values = F.interpolate(
229
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
230
+ ).transpose(
231
+ 2, 1
232
+ ) #######
233
+ tmp_over_one %= 1
234
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
235
+ cumsum_shift = torch.zeros_like(rad_values)
236
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
237
+ sine_waves = torch.sin(
238
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
239
+ )
240
+ sine_waves = sine_waves * self.sine_amp
241
+ uv = self._f02uv(f0)
242
+ uv = F.interpolate(
243
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
244
+ ).transpose(2, 1)
245
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
246
+ noise = noise_amp * torch.randn_like(sine_waves)
247
+ sine_waves = sine_waves * uv + noise
248
+ return sine_waves, uv, noise
249
+ else:
250
+ with torch.no_grad():
251
+ # fundamental component
252
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
253
 
254
+ # generate sine waveforms
255
+ sine_waves = self._f02sine(fn) * self.sine_amp
256
 
257
+ # generate uv signal
258
+ # uv = torch.ones(f0.shape)
259
+ # uv = uv * (f0 > self.voiced_threshold)
260
+ uv = self._f02uv(f0)
261
 
262
+ # noise: for unvoiced should be similar to sine_amp
263
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
264
+ # . for voiced regions is self.noise_std
265
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
266
+ noise = noise_amp * torch.randn_like(sine_waves)
267
 
268
+ # first: set the unvoiced part to 0 by uv
269
+ # then: additive noise
270
+ sine_waves = sine_waves * uv + noise
271
+ return sine_waves, uv, noise
272
 
273
 
274
  class SourceModuleHnNSF(torch.nn.Module):
 
304
  self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
305
  self.l_tanh = torch.nn.Tanh()
306
 
307
+ def forward(self, x, upp=None):
308
  """
309
  Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
310
  F0_sampled (batchsize, length, 1)
 
312
  noise_source (batchsize, length 1)
313
  """
314
  # source for harmonic branch
315
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
316
  sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
317
 
318
  # source for noise branch, in the same shape as uv
 
356
  self.ups.apply(init_weights)
357
  self.conv_post.apply(init_weights)
358
  self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
359
+ self.upp = np.prod(h["upsample_rates"])
360
+ self.onnx = False
361
+
362
+ def OnnxExport(self):
363
+ self.onnx = True
364
+ self.m_source.l_sin_gen.onnx = True
365
 
366
  def forward(self, x, f0, g=None):
367
  # print(1,x.shape,f0.shape,f0[:, None].shape)
368
+ if not self.onnx:
369
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
370
  # print(2,f0.shape)
371
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
372
  har_source = har_source.transpose(1, 2)
373
  x = self.conv_pre(x)
374
  x = x + self.cond(g)
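
Note on the change above (editor's sketch, not part of this commit): the Generator now carries an `onnx` flag plus an `OnnxExport()` switch, and `SourceModuleHnNSF`/`SineGen` accept the total upsample factor `upp`, so in export mode the sine source is built from frame-level F0 with `F.interpolate` instead of the training-time `f0_upsamp` path. A minimal usage sketch follows, assuming an already constructed generator `gen`; the mel channel count (80) and `gin_channels` (256) are placeholders, take them from your own hparams:

import torch

def export_generator(gen, out_path="nsf_hifigan_generator.onnx"):
    gen.eval()
    gen.OnnxExport()  # switch SineGen to the interpolate-based, trace-friendly branch
    frames = 100
    x = torch.randn(1, 80, frames)        # mel / hidden features (channel count is an assumption)
    f0 = 220.0 * torch.ones(1, frames)    # frame-level F0 in Hz
    g = torch.randn(1, 256, 1)            # speaker embedding (gin_channels is an assumption)
    torch.onnx.export(
        gen, (x, f0, g), out_path,
        input_names=["x", "f0", "g"], output_names=["audio"],
        dynamic_axes={"x": {2: "frames"}, "f0": {1: "frames"}, "audio": {2: "samples"}},
        opset_version=16,
    )

Presumably the separate branch exists because the training-time `_f02sine` path does not trace cleanly; the ONNX branch rebuilds the sine source from plain `cumsum`/`interpolate` ops so the exported graph stays shape-stable.
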
vdecoder/hifiganwithsnake/models.py CHANGED
@@ -141,6 +141,7 @@ class SineGen(torch.nn.Module):
141
  self.sampling_rate = samp_rate
142
  self.voiced_threshold = voiced_threshold
143
  self.flag_for_pulse = flag_for_pulse
 
144
 
145
  def _f02uv(self, f0):
146
  # generate uv signal
@@ -206,35 +207,82 @@ class SineGen(torch.nn.Module):
206
  sines = torch.cos(i_phase * 2 * np.pi)
207
  return sines
208
 
209
- def forward(self, f0):
210
  """ sine_tensor, uv = forward(f0)
211
  input F0: tensor(batchsize=1, length, dim=1)
212
  f0 for unvoiced steps should be 0
213
  output sine_tensor: tensor(batchsize=1, length, dim)
214
  output uv: tensor(batchsize=1, length, 1)
215
  """
216
- with torch.no_grad():
217
- # fundamental component
218
- fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
219
 
220
- # generate sine waveforms
221
- sine_waves = self._f02sine(fn) * self.sine_amp
222
 
223
- # generate uv signal
224
- # uv = torch.ones(f0.shape)
225
- # uv = uv * (f0 > self.voiced_threshold)
226
- uv = self._f02uv(f0)
227
 
228
- # noise: for unvoiced should be similar to sine_amp
229
- # std = self.sine_amp/3 -> max value ~ self.sine_amp
230
- # . for voiced regions is self.noise_std
231
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
232
- noise = noise_amp * torch.randn_like(sine_waves)
233
 
234
- # first: set the unvoiced part to 0 by uv
235
- # then: additive noise
236
- sine_waves = sine_waves * uv + noise
237
- return sine_waves, uv, noise
238
 
239
 
240
  class SourceModuleHnNSF(torch.nn.Module):
@@ -270,7 +318,7 @@ class SourceModuleHnNSF(torch.nn.Module):
270
  self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
271
  self.l_tanh = torch.nn.Tanh()
272
 
273
- def forward(self, x):
274
  """
275
  Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
276
  F0_sampled (batchsize, length, 1)
@@ -278,7 +326,7 @@ class SourceModuleHnNSF(torch.nn.Module):
278
  noise_source (batchsize, length 1)
279
  """
280
  # source for harmonic branch
281
- sine_wavs, uv, _ = self.l_sin_gen(x)
282
  sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
283
 
284
  # source for noise branch, in the same shape as uv
@@ -325,12 +373,19 @@ class Generator(torch.nn.Module):
325
  self.conv_post.apply(init_weights)
326
  self.snake_post = SnakeAlias(ch, C = h["upsample_initial_channel"] >> len(self.ups))
327
  self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
328
 
329
  def forward(self, x, f0, g=None):
330
  # print(1,x.shape,f0.shape,f0[:, None].shape)
331
- f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
 
332
  # print(2,f0.shape)
333
- har_source, noi_source, uv = self.m_source(f0)
334
  har_source = har_source.transpose(1, 2)
335
  x = self.conv_pre(x)
336
  x = x + self.cond(g)
 
141
  self.sampling_rate = samp_rate
142
  self.voiced_threshold = voiced_threshold
143
  self.flag_for_pulse = flag_for_pulse
144
+ self.onnx = False
145
 
146
  def _f02uv(self, f0):
147
  # generate uv signal
 
207
  sines = torch.cos(i_phase * 2 * np.pi)
208
  return sines
209
 
210
+ def forward(self, f0, upp=None):
211
  """ sine_tensor, uv = forward(f0)
212
  input F0: tensor(batchsize=1, length, dim=1)
213
  f0 for unvoiced steps should be 0
214
  output sine_tensor: tensor(batchsize=1, length, dim)
215
  output uv: tensor(batchsize=1, length, 1)
216
  """
217
+
218
+ if self.onnx:
219
+ with torch.no_grad():
220
+ f0 = f0[:, None].transpose(1, 2)
221
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
222
+ # fundamental component
223
+ f0_buf[:, :, 0] = f0[:, :, 0]
224
+ for idx in np.arange(self.harmonic_num):
225
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
226
+ idx + 2
227
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
228
+ rad_values = (f0_buf / self.sampling_rate) % 1  ### the % 1 means the n_har products cannot be optimized away in post-processing
229
+ rand_ini = torch.rand(
230
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
231
+ )
232
+ rand_ini[:, 0] = 0
233
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
234
+ tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  ##### taking % 1 here would mean the later cumsum can no longer be optimized
235
+ tmp_over_one *= upp
236
+ tmp_over_one = F.interpolate(
237
+ tmp_over_one.transpose(2, 1),
238
+ scale_factor=upp,
239
+ mode="linear",
240
+ align_corners=True,
241
+ ).transpose(2, 1)
242
+ rad_values = F.interpolate(
243
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
244
+ ).transpose(
245
+ 2, 1
246
+ ) #######
247
+ tmp_over_one %= 1
248
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
249
+ cumsum_shift = torch.zeros_like(rad_values)
250
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
251
+ sine_waves = torch.sin(
252
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
253
+ )
254
+ sine_waves = sine_waves * self.sine_amp
255
+ uv = self._f02uv(f0)
256
+ uv = F.interpolate(
257
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
258
+ ).transpose(2, 1)
259
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
260
+ noise = noise_amp * torch.randn_like(sine_waves)
261
+ sine_waves = sine_waves * uv + noise
262
+ return sine_waves, uv, noise
263
+ else:
264
+ with torch.no_grad():
265
+ # fundamental component
266
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
267
 
268
+ # generate sine waveforms
269
+ sine_waves = self._f02sine(fn) * self.sine_amp
270
 
271
+ # generate uv signal
272
+ # uv = torch.ones(f0.shape)
273
+ # uv = uv * (f0 > self.voiced_threshold)
274
+ uv = self._f02uv(f0)
275
 
276
+ # noise: for unvoiced should be similar to sine_amp
277
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
278
+ # . for voiced regions is self.noise_std
279
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
280
+ noise = noise_amp * torch.randn_like(sine_waves)
281
 
282
+ # first: set the unvoiced part to 0 by uv
283
+ # then: additive noise
284
+ sine_waves = sine_waves * uv + noise
285
+ return sine_waves, uv, noise
286
 
287
 
288
  class SourceModuleHnNSF(torch.nn.Module):
 
318
  self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
319
  self.l_tanh = torch.nn.Tanh()
320
 
321
+ def forward(self, x, upp=None):
322
  """
323
  Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
324
  F0_sampled (batchsize, length, 1)
 
326
  noise_source (batchsize, length 1)
327
  """
328
  # source for harmonic branch
329
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
330
  sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
331
 
332
  # source for noise branch, in the same shape as uv
 
373
  self.conv_post.apply(init_weights)
374
  self.snake_post = SnakeAlias(ch, C = h["upsample_initial_channel"] >> len(self.ups))
375
  self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
376
+ self.upp = np.prod(h["upsample_rates"])
377
+ self.onnx = False
378
+
379
+ def OnnxExport(self):
380
+ self.onnx = True
381
+ self.m_source.l_sin_gen.onnx = True
382
 
383
  def forward(self, x, f0, g=None):
384
  # print(1,x.shape,f0.shape,f0[:, None].shape)
385
+ if not self.onnx:
386
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
387
  # print(2,f0.shape)
388
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
389
  har_source = har_source.transpose(1, 2)
390
  x = self.conv_pre(x)
391
  x = x + self.cond(g)
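
For reference, both ONNX branches above generate the sine source by phase accumulation: the normalized frequency f0/sr is cumulatively summed into a phase (with `cumsum_shift` compensating for the wrap-arounds of the modulo-1 phase), and sin(2*pi*phase) then tracks the F0 contour without an explicit time index. A self-contained toy version of that idea (illustration only, not code from this repository):

import numpy as np
import torch

sr = 44100
f0 = torch.full((1, sr, 1), 220.0)      # one second of a constant 220 Hz contour
rad = f0 / sr                           # instantaneous frequency in cycles per sample
phase = torch.cumsum(rad, dim=1)        # accumulated phase, in cycles
sine = torch.sin(2 * np.pi * phase)     # (1, sr, 1) sine wave at 220 Hz

In the real code the accumulation runs over `rad_values` that were already reduced modulo 1 and interpolated up to sample rate, which is why the extra `tmp_over_one` / `cumsum_shift` bookkeeping is needed to keep the phase continuous.
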