darksakura committed on
Commit cb73098
1 Parent(s): c96445d

Upload 4 files

Files changed (4)
  1. data_utils.py +9 -9
  2. inference_main.py +92 -74
  3. models.py +56 -32
  4. train.py +35 -37
data_utils.py CHANGED
@@ -1,14 +1,13 @@
- import time
  import os
  import random
+
  import numpy as np
  import torch
  import torch.utils.data

- import modules.commons as commons
  import utils
- from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
- from utils import load_wav_to_torch, load_filepaths_and_text
+ from modules.mel_processing import spectrogram_torch
+ from utils import load_filepaths_and_text, load_wav_to_torch

  # import h5py

@@ -31,6 +30,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
  self.filter_length = hparams.data.filter_length
  self.hop_length = hparams.data.hop_length
  self.win_length = hparams.data.win_length
+ self.unit_interpolate_mode = hparams.data.unit_interpolate_mode
  self.sampling_rate = hparams.data.sampling_rate
  self.use_sr = hparams.train.use_sr
  self.spec_len = hparams.train.max_speclen
@@ -73,7 +73,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
  uv = torch.FloatTensor(np.array(uv,dtype=float))

  c = torch.load(filename+ ".soft.pt")
- c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0], mode=self.unit_interpolate_mode)
  if self.vol_emb:
  volume_path = filename + ".vol.npy"
  volume = np.load(volume_path)
@@ -86,7 +86,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
  assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
  spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
  audio_norm = audio_norm[:, :lmin * self.hop_length]
- if volume!= None:
+ if volume is not None:
  volume = volume[:lmin]
  return c, f0, spec, audio_norm, spk, uv, volume
@@ -95,7 +95,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
  # print("skip too short audio:", filename)
  # return None

- if random.choice([True, False]) and self.vol_aug and volume!=None:
+ if random.choice([True, False]) and self.vol_aug and volume is not None:
  max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
  max_shift = min(1, np.log10(1/max_amp))
  log10_vol_shift = random.uniform(-1, max_shift)
@@ -113,7 +113,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
  end = start + 790
  spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
  audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
- if volume !=None:
+ if volume is not None:
  volume = volume[start:end]
  return c, f0, spec, audio_norm, spk, uv,volume
@@ -177,7 +177,7 @@ class TextAudioCollate:
  uv = row[5]
  uv_padded[i, :uv.size(0)] = uv
  volume = row[6]
- if volume != None:
+ if volume is not None:
  volume_padded[i, :volume.size(0)] = volume
  else :
  volume_padded = None
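
Note: the new `unit_interpolate_mode` hparam is forwarded to `utils.repeat_expand_2d`, which stretches the content units along the time axis to match the F0 frame count. The real helper lives in utils.py of the so-vits-svc tree; the sketch below only illustrates that resampling step, and the mode names it accepts are assumptions borrowed from torch's interpolate, not necessarily the ones utils.py supports.

import torch
import torch.nn.functional as F

def repeat_expand_2d_sketch(content: torch.Tensor, target_len: int, mode: str = "nearest") -> torch.Tensor:
    # Resample a [channels, frames] unit tensor to target_len frames.
    # "nearest" repeats frames; "linear" interpolates between them.
    resampled = F.interpolate(content.unsqueeze(0), size=target_len, mode=mode)
    return resampled.squeeze(0)

# Usage mirroring the loader above: align soft units with the F0 track length.
units = torch.randn(256, 173)   # hypothetical [C, T] content features
aligned = repeat_expand_2d_sketch(units, 200, mode="nearest")
assert aligned.shape == (256, 200)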
inference_main.py CHANGED
@@ -1,16 +1,10 @@
- import io
  import logging
- import time
- from pathlib import Path

- import librosa
- import matplotlib.pyplot as plt
- import numpy as np
  import soundfile

  from inference import infer_tool
- from inference import slicer
  from inference.infer_tool import Svc
+ from spkmix import spk_mix_map

  logging.getLogger('numba').setLevel(logging.WARNING)
  chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
@@ -23,21 +17,33 @@ def main():
  parser = argparse.ArgumentParser(description='sovits4 inference')

  # Required settings
- parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='model path')
- parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='config file path')
+ parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='model path')
+ parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='config file path')
  parser.add_argument('-cl', '--clip', type=float, default=0, help='forced slice length in seconds; 0 (default) means automatic slicing')
  parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='list of wav file names placed under the raw folder')
  parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='pitch shift, positive or negative (semitones)')
- parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='target speaker name(s) for synthesis')
-
+ parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='target speaker name(s) for synthesis')
+
  # Optional settings
- parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False,help='automatically predict pitch for voice conversion; do not enable when converting singing or it will go badly off-key')
- parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='cluster model path; fill in anything if no cluster model was trained')
- parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='cluster scheme ratio, range 0-1; keep the default 0 if no cluster model was trained')
+ parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='automatically predict pitch for voice conversion; do not enable when converting singing or it will go badly off-key')
+ parser.add_argument('-cm', '--cluster_model_path', type=str, default="", help='path to the cluster model or feature-retrieval index; leave empty to use the default path of the chosen scheme; fill in anything if neither was trained')
+ parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='ratio of the cluster scheme or feature retrieval, range 0-1; keep the default 0 if neither was trained')
  parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='crossfade length in seconds between two sliced segments; adjust it if forced slicing makes the vocals discontinuous, otherwise keep the default 0')
- parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='apply a mean filter (pooling) to F0, which helps some muted notes; enabling it slows down inference; off by default')
- parser.add_argument('-eh', '--enhance', type=bool, default=False, help='use the NSF_HIFIGAN enhancer; gives some quality gain for models trained on little data but hurts well-trained models; off by default')
-
+ parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor: crepe, pm, dio, harvest or rmvpe; default pm (note: crepe applies a mean filter to the raw F0)')
+ parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='use the NSF_HIFIGAN enhancer; gives some quality gain for models trained on little data but hurts well-trained models; off by default')
+ parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='use shallow diffusion, which can fix some metallic artifacts; off by default; when enabled, the NSF_HIFIGAN enhancer is disabled')
+ parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='use speaker mixing')
+ parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1, help='mix ratio for replacing the output loudness envelope with the input loudness envelope; the closer to 1, the more the output envelope is used')
+ parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False, help='use feature retrieval; disables the cluster model, and cm/cr then become the retrieval index path and mix ratio')
+
+ # Shallow diffusion settings
+ parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='diffusion model path')
+ parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='diffusion model config file path')
+ parser.add_argument('-ks', '--k_step', type=int, default=100, help='number of diffusion steps; larger is closer to the pure diffusion result; default 100')
+ parser.add_argument('-se', '--second_encoding', action='store_true', default=False, help='re-encode the source audio before shallow diffusion; a hit-or-miss option that sometimes helps and sometimes hurts')
+ parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='pure diffusion mode; does not load the sovits model, inference uses the diffusion model only')
+
+
  # Settings you normally do not need to change
  parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='default -40; -30 for noisy audio, -50 for dry vocals where breaths should be kept')
  parser.add_argument('-d', '--device', type=str, default=None, help='inference device; None selects cpu or gpu automatically')
@@ -46,7 +52,9 @@ def main():
  parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='audio output format')
  parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='after automatic slicing, the head and tail of each slice are discarded; this sets the proportion of the crossfade length to keep, range 0-1 (open on the left, closed on the right)')
  parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='adapt the enhancer to a higher vocal range (in semitones) | default 0')
-
+ parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0 filter threshold, only effective with crepe; range 0-1; lowering it reduces off-key notes but increases muted notes')
+
+
  args = parser.parse_args()

  clean_names = args.clean_names
@@ -61,75 +69,85 @@ def main():
  clip = args.clip
  lg = args.linear_gradient
  lgr = args.linear_gradient_retain
- F0_mean_pooling = args.f0_mean_pooling
+ f0p = args.f0_predictor
  enhance = args.enhance
  enhancer_adaptive_key = args.enhancer_adaptive_key
+ cr_threshold = args.f0_filter_threshold
+ diffusion_model_path = args.diffusion_model_path
+ diffusion_config_path = args.diffusion_config_path
+ k_step = args.k_step
+ only_diffusion = args.only_diffusion
+ shallow_diffusion = args.shallow_diffusion
+ use_spk_mix = args.use_spk_mix
+ second_encoding = args.second_encoding
+ loudness_envelope_adjustment = args.loudness_envelope_adjustment

- svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
- infer_tool.mkdir(["raw", "results"])
+ if cluster_infer_ratio != 0:
+ if args.cluster_model_path == "":
+ if args.feature_retrieval:  # a ratio was given but no model path: pick the default path based on whether feature retrieval is used
+ args.cluster_model_path = "logs/44k/feature_and_index.pkl"
+ else:
+ args.cluster_model_path = "logs/44k/kmeans_10000.pt"
+ else:  # no ratio was given: clear the path regardless, so no cluster model is loaded later
+ args.cluster_model_path = ""

+ svc_model = Svc(args.model_path,
+ args.config_path,
+ args.device,
+ args.cluster_model_path,
+ enhance,
+ diffusion_model_path,
+ diffusion_config_path,
+ shallow_diffusion,
+ only_diffusion,
+ use_spk_mix,
+ args.feature_retrieval)
+
+ infer_tool.mkdir(["raw", "results"])
+
+ if len(spk_mix_map)<=1:
+ use_spk_mix = False
+ if use_spk_mix:
+ spk_list = [spk_mix_map]
+
  infer_tool.fill_a_to_b(trans, clean_names)
  for clean_name, tran in zip(clean_names, trans):
  raw_audio_path = f"raw/{clean_name}"
  if "." not in raw_audio_path:
  raw_audio_path += ".wav"
  infer_tool.format_wav(raw_audio_path)
- wav_path = Path(raw_audio_path).with_suffix('.wav')
- chunks = slicer.cut(wav_path, db_thresh=slice_db)
- audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
- per_size = int(clip*audio_sr)
- lg_size = int(lg*audio_sr)
- lg_size_r = int(lg_size*lgr)
- lg_size_c_l = (lg_size-lg_size_r)//2
- lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
- lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
-
  for spk in spk_list:
- audio = []
- for (slice_tag, data) in audio_data:
- print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
-
- length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
- if slice_tag:
- print('jump empty segment')
- _audio = np.zeros(length)
- audio.extend(list(infer_tool.pad_array(_audio, length)))
- continue
- if per_size != 0:
- datas = infer_tool.split_list_by_n(data, per_size,lg_size)
- else:
- datas = [data]
- for k,dat in enumerate(datas):
- per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
- if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
- # padd
- pad_len = int(audio_sr * pad_seconds)
- dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
- raw_path = io.BytesIO()
- soundfile.write(raw_path, dat, audio_sr, format="wav")
- raw_path.seek(0)
- out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
- cluster_infer_ratio=cluster_infer_ratio,
- auto_predict_f0=auto_predict_f0,
- noice_scale=noice_scale,
- F0_mean_pooling = F0_mean_pooling,
- enhancer_adaptive_key = enhancer_adaptive_key
- )
- _audio = out_audio.cpu().numpy()
- pad_len = int(svc_model.target_sample * pad_seconds)
- _audio = _audio[pad_len:-pad_len]
- _audio = infer_tool.pad_array(_audio, per_length)
- if lg_size!=0 and k!=0:
- lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
- lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
- lg_pre = lg1*(1-lg)+lg2*lg
- audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
- audio.extend(lg_pre)
- _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
- audio.extend(list(_audio))
+ kwarg = {
+ "raw_audio_path" : raw_audio_path,
+ "spk" : spk,
+ "tran" : tran,
+ "slice_db" : slice_db,
+ "cluster_infer_ratio" : cluster_infer_ratio,
+ "auto_predict_f0" : auto_predict_f0,
+ "noice_scale" : noice_scale,
+ "pad_seconds" : pad_seconds,
+ "clip_seconds" : clip,
+ "lg_num": lg,
+ "lgr_num" : lgr,
+ "f0_predictor" : f0p,
+ "enhancer_adaptive_key" : enhancer_adaptive_key,
+ "cr_threshold" : cr_threshold,
+ "k_step":k_step,
+ "use_spk_mix":use_spk_mix,
+ "second_encoding":second_encoding,
+ "loudness_envelope_adjustment":loudness_envelope_adjustment
+ }
+ audio = svc_model.slice_inference(**kwarg)
  key = "auto" if auto_predict_f0 else f"{tran}key"
  cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
- res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
+ isdiffusion = "sovits"
+ if shallow_diffusion :
+ isdiffusion = "sovdiff"
+ if only_diffusion :
+ isdiffusion = "diff"
+ if use_spk_mix:
+ spk = "spk_mix"
+ res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0p}.{wav_format}'
  soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
  svc_model.clear_empty()
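
For readers skimming the diff, the cluster/feature-retrieval defaulting added to main() can be read on its own. Below is a standalone restatement of that logic; the function name is ours, the default paths are the ones in the diff.

def resolve_cluster_model_path(cluster_infer_ratio, cluster_model_path, feature_retrieval):
    if cluster_infer_ratio == 0:
        # No ratio requested: clear the path so no cluster/index model is loaded,
        # even if one was passed explicitly.
        return ""
    if cluster_model_path:
        # An explicit path always wins when a ratio is requested.
        return cluster_model_path
    # Ratio requested but no path given: fall back to the scheme's default file.
    return "logs/44k/feature_and_index.pkl" if feature_retrieval else "logs/44k/kmeans_10000.pt"

assert resolve_cluster_model_path(0, "logs/44k/kmeans_10000.pt", False) == ""
assert resolve_cluster_model_path(0.5, "", True) == "logs/44k/feature_and_index.pkl"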
 
models.py CHANGED
@@ -1,21 +1,17 @@
- import copy
- import math
  import torch
  from torch import nn
+ from torch.nn import Conv1d, Conv2d
  from torch.nn import functional as F
+ from torch.nn.utils import spectral_norm, weight_norm

  import modules.attentions as attentions
  import modules.commons as commons
  import modules.modules as modules
-
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-
  import utils
- from modules.commons import init_weights, get_padding
- from vdecoder.hifigan.models import Generator
+ from modules.commons import get_padding
  from utils import f0_to_coarse

+
  class ResidualCouplingBlock(nn.Module):
  def __init__(self,
  channels,
@@ -126,7 +122,7 @@ class DiscriminatorP(torch.nn.Module):
  super(DiscriminatorP, self).__init__()
  self.period = period
  self.use_spectral_norm = use_spectral_norm
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
  self.convs = nn.ModuleList([
  norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
  norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@@ -161,7 +157,7 @@ class DiscriminatorP(torch.nn.Module):
  class DiscriminatorS(torch.nn.Module):
  def __init__(self, use_spectral_norm=False):
  super(DiscriminatorS, self).__init__()
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
  self.convs = nn.ModuleList([
  norm_f(Conv1d(1, 16, 15, 1, padding=7)),
  norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
@@ -321,6 +317,10 @@ class SynthesizerTrn(nn.Module):
  n_speakers,
  sampling_rate=44100,
  vol_embedding=False,
+ vocoder_name = "nsf-hifigan",
+ use_depthwise_conv = False,
+ use_automatic_f0_prediction = True,
+ n_flow_layer = 4,
  **kwargs):

  super().__init__()
@@ -343,6 +343,8 @@ class SynthesizerTrn(nn.Module):
  self.ssl_dim = ssl_dim
  self.vol_embedding = vol_embedding
  self.emb_g = nn.Embedding(n_speakers, gin_channels)
+ self.use_depthwise_conv = use_depthwise_conv
+ self.use_automatic_f0_prediction = use_automatic_f0_prediction
  if vol_embedding:
  self.emb_vol = nn.Linear(1, hidden_channels)

@@ -367,20 +369,35 @@ class SynthesizerTrn(nn.Module):
  "upsample_initial_channel": upsample_initial_channel,
  "upsample_kernel_sizes": upsample_kernel_sizes,
  "gin_channels": gin_channels,
+ "use_depthwise_conv":use_depthwise_conv
  }
- self.dec = Generator(h=hps)
+
+ modules.set_Conv1dModel(self.use_depthwise_conv)
+
+ if vocoder_name == "nsf-hifigan":
+ from vdecoder.hifigan.models import Generator
+ self.dec = Generator(h=hps)
+ elif vocoder_name == "nsf-snake-hifigan":
+ from vdecoder.hifiganwithsnake.models import Generator
+ self.dec = Generator(h=hps)
+ else:
+ print("[?] Unkown vocoder: use default(nsf-hifigan)")
+ from vdecoder.hifigan.models import Generator
+ self.dec = Generator(h=hps)
+
  self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
- self.f0_decoder = F0Decoder(
- 1,
- hidden_channels,
- filter_channels,
- n_heads,
- n_layers,
- kernel_size,
- p_dropout,
- spk_channels=gin_channels
- )
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
+ if self.use_automatic_f0_prediction:
+ self.f0_decoder = F0Decoder(
+ 1,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ spk_channels=gin_channels
+ )
  self.emb_uv = nn.Embedding(2, hidden_channels)
  self.character_mix = False

@@ -395,17 +412,21 @@ class SynthesizerTrn(nn.Module):
  g = self.emb_g(g).transpose(1,2)

  # vol proj
- vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
+ vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0

  # ssl prenet
  x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
  x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
-
+
  # f0 predict
- lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
- norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
- pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
-
+ if self.use_automatic_f0_prediction:
+ lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
+ norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
+ pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+ else:
+ lf0 = 0
+ norm_lf0 = 0
+ pred_lf0 = 0
  # encoder
  z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
  z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
@@ -419,6 +440,7 @@ class SynthesizerTrn(nn.Module):

  return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0

+ @torch.no_grad()
  def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False, vol = None):

  if c.device == torch.device("cuda"):
@@ -440,11 +462,13 @@ class SynthesizerTrn(nn.Module):

  x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
  # vol proj
- vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
-
- x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol

- if predict_f0:
+ vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
+
+ x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
+
+
+ if self.use_automatic_f0_prediction and predict_f0:
  lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
  norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
  pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
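
The constructor now picks the decoder from `vocoder_name` instead of importing `vdecoder.hifigan` at module level. The factory below restates that dispatch on its own; the helper name is ours, the module paths are the ones in the diff, and it only runs inside the so-vits-svc source tree.

def build_vocoder(vocoder_name, hps):
    # Lazy imports keep the unused decoder variant out of memory; this mirrors
    # the branch added to SynthesizerTrn.__init__ above.
    if vocoder_name == "nsf-snake-hifigan":
        from vdecoder.hifiganwithsnake.models import Generator
    else:
        if vocoder_name != "nsf-hifigan":
            # Same fallback behaviour as the diff: warn and use the default decoder.
            print("[?] Unkown vocoder: use default(nsf-hifigan)")
        from vdecoder.hifigan.models import Generator
    return Generator(h=hps)

# dec = build_vocoder("nsf-snake-hifigan", hps)  # hps: the dict assembled in __init__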
train.py CHANGED
@@ -1,39 +1,30 @@
  import logging
  import multiprocessing
+ import os
  import time

- logging.getLogger('matplotlib').setLevel(logging.WARNING)
- logging.getLogger('numba').setLevel(logging.WARNING)
-
- import os
- import json
- import argparse
- import itertools
- import math
  import torch
- from torch import nn, optim
+ import torch.distributed as dist
+ import torch.multiprocessing as mp
+ from torch.cuda.amp import GradScaler, autocast
  from torch.nn import functional as F
+ from torch.nn.parallel import DistributedDataParallel as DDP
  from torch.utils.data import DataLoader
  from torch.utils.tensorboard import SummaryWriter
- import torch.multiprocessing as mp
- import torch.distributed as dist
- from torch.nn.parallel import DistributedDataParallel as DDP
- from torch.cuda.amp import autocast, GradScaler

  import modules.commons as commons
  import utils
- from data_utils import TextAudioSpeakerLoader, TextAudioCollate
+ from data_utils import TextAudioCollate, TextAudioSpeakerLoader
  from models import (
- SynthesizerTrn,
  MultiPeriodDiscriminator,
+ SynthesizerTrn,
  )
- from modules.losses import (
- kl_loss,
- generator_loss, discriminator_loss, feature_loss
- )
-
+ from modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
  from modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch

+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
+ logging.getLogger('numba').setLevel(logging.WARNING)
+
  torch.backends.cudnn.benchmark = True
  global_step = 0
  start_time = time.time()
@@ -61,7 +52,7 @@ def run(rank, n_gpus, hps):
  utils.check_git_hash(hps.model_dir)
  writer = SummaryWriter(log_dir=hps.model_dir)
  writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
-
+
  # for pytorch on win, backend use gloo
  dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
  torch.manual_seed(hps.train.seed)
@@ -108,7 +99,7 @@ def run(rank, n_gpus, hps):
  name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth")
  global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1
  #global_step = (epoch_str - 1) * len(train_loader)
- except:
+ except Exception:
  print("load old checkpoint failed...")
  epoch_str = 1
  global_step = 0
@@ -148,6 +139,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
  train_loader, eval_loader = loaders
  if writers is not None:
  writer, writer_eval = writers
+
+ half_type = torch.bfloat16 if hps.train.half_type=="bf16" else torch.float16

  # train_loader.batch_sampler.set_epoch(epoch)
  global global_step
@@ -169,8 +162,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
  hps.data.sampling_rate,
  hps.data.mel_fmin,
  hps.data.mel_fmax)
-
- with autocast(enabled=hps.train.fp16_run):
+
+ with autocast(enabled=hps.train.fp16_run, dtype=half_type):
  y_hat, ids_slice, z_mask, \
  (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
  spec_lengths=lengths,vol = volume)
@@ -191,25 +184,26 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
  # Discriminator
  y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())

- with autocast(enabled=False):
+ with autocast(enabled=False, dtype=half_type):
  loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
  loss_disc_all = loss_disc
-
+
  optim_d.zero_grad()
  scaler.scale(loss_disc_all).backward()
  scaler.unscale_(optim_d)
  grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
  scaler.step(optim_d)
+

- with autocast(enabled=hps.train.fp16_run):
+ with autocast(enabled=hps.train.fp16_run, dtype=half_type):
  # Generator
  y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
- with autocast(enabled=False):
+ with autocast(enabled=False, dtype=half_type):
  loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
  loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
  loss_fm = feature_loss(fmap_r, fmap_g)
  loss_gen, losses_gen = generator_loss(y_d_hat_g)
- loss_lf0 = F.mse_loss(pred_lf0, lf0)
+ loss_lf0 = F.mse_loss(pred_lf0, lf0) if net_g.module.use_automatic_f0_prediction else 0
  loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0
  optim_g.zero_grad()
  scaler.scale(loss_gen_all).backward()
@@ -241,13 +235,17 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
  image_dict = {
  "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
  "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
- "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
- "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
- pred_lf0[0, 0, :].detach().cpu().numpy()),
- "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
- norm_lf0[0, 0, :].detach().cpu().numpy())
+ "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy())
  }

+ if net_g.module.use_automatic_f0_prediction:
+ image_dict.update({
+ "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
+ pred_lf0[0, 0, :].detach().cpu().numpy()),
+ "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
+ norm_lf0[0, 0, :].detach().cpu().numpy())
+ })
+
  utils.summarize(
  writer=writer,
  global_step=global_step,
@@ -287,7 +285,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
  c = c[:1].cuda(0)
  f0 = f0[:1].cuda(0)
  uv= uv[:1].cuda(0)
- if volume!=None:
+ if volume is not None:
  volume = volume[:1].cuda(0)
  mel = spec_to_mel_torch(
  spec,
@@ -314,7 +312,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
  f"gt/audio_{batch_idx}": y[0]
  })
  image_dict.update({
- f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
+ "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
  "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
  })
  utils.summarize(
@@ -328,4 +326,4 @@ def evaluate(hps, generator, eval_loader, writer_eval):


  if __name__ == "__main__":
- main()
+ main()
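
The training loop now honours a `half_type` field in the config: autocast runs in bf16 when `hps.train.half_type == "bf16"` and in fp16 otherwise. A small self-contained illustration of that selection follows; the model and data here are made up for the example.

import torch
from torch.cuda.amp import autocast

def pick_half_type(half_type_cfg):
    # "bf16" is the value checked in the diff; anything else falls back to fp16.
    return torch.bfloat16 if half_type_cfg == "bf16" else torch.float16

half_type = pick_half_type("fp16")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(8, 1).to(device)
x = torch.randn(4, 8, device=device)

# Mirrors `with autocast(enabled=hps.train.fp16_run, dtype=half_type):` in the diff;
# with enabled=False everything stays in fp32, exactly as when fp16_run is off.
with autocast(enabled=torch.cuda.is_available(), dtype=half_type):
    y = model(x)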