ArkanDash committed on
Commit
31a8225
1 Parent(s): d03d6be

feat: update infer

Browse files
infer_pack/models_onnx.py CHANGED
@@ -550,6 +550,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
550
  spk_embed_dim,
551
  gin_channels,
552
  sr,
 
553
  **kwargs
554
  ):
555
  super().__init__()
@@ -573,7 +574,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
573
  self.gin_channels = gin_channels
574
  # self.hop_length = hop_length#
575
  self.spk_embed_dim = spk_embed_dim
576
- if self.gin_channels == 256:
577
  self.enc_p = TextEncoder256(
578
  inter_channels,
579
  hidden_channels,
 
550
  spk_embed_dim,
551
  gin_channels,
552
  sr,
553
+ version,
554
  **kwargs
555
  ):
556
  super().__init__()
 
574
  self.gin_channels = gin_channels
575
  # self.hop_length = hop_length#
576
  self.spk_embed_dim = spk_embed_dim
577
+ if version == "v1":
578
  self.enc_p = TextEncoder256(
579
  inter_channels,
580
  hidden_channels,
infer_pack/modules/F0Predictor/DioF0Predictor.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
class DioF0Predictor(F0Predictor):
    """F0 predictor that runs pyworld's DIO estimator refined by StoneMask."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        """Store the analysis settings.

        Args:
            hop_length: Hop size in samples; sets the analysis frame period
                and the default number of output frames.
            f0_min: Passed to DIO as ``f0_floor``.
            f0_max: Passed to DIO as ``f0_ceil``.
            sampling_rate: Sampling rate handed to pyworld as ``fs``.
        """
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Interpolate F0 across unvoiced frames.

        Frames whose value is <= 0 are treated as unvoiced. A gap between two
        voiced frames is filled linearly, a gap with no voiced frame before it
        is filled with the next voiced value, and a gap reaching the end is
        filled with the last voiced value seen.

        Args:
            f0: NumPy array of F0 values (its ``size`` attribute is read).

        Returns:
            Tuple ``(f0_interpolated, vuv)`` of 1-D arrays. ``vuv`` is float32
            with 1.0 where the input was > 0 and 0.0 elsewhere.
        """
        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # ``ip_data`` is the same array object as ``data``: every write below
        # is visible through ``data``, and the loop relies on that (once a gap
        # is filled, its frames read as voiced on later iterations).
        # np.reshape may return a view, so the caller's array can be modified.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Look for the next voiced frame; ``j`` keeps its initial
                # value when the range below is empty.
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                # NOTE(review): a voiced frame found exactly at the last index
                # also fails this test and is overwritten by the tail fill --
                # looks like an off-by-one; behaviour kept as is.
                if j < frame_number - 1:
                    if last_value > 0.0:
                        # Linear ramp from the previous voiced frame to frame j.
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        # No voiced frame seen yet: back-fill with frame j.
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    # Tail: hold the last voiced value until the end.
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Resample an F0 contour to ``target_len`` frames by linear interpolation.

        Values below 0.001 are replaced by NaN before interpolating and NaNs
        in the result are converted back by ``np.nan_to_num``.

        Args:
            x: Sequence of F0 values.
            target_len: Number of frames to produce.

        Returns:
            The resampled contour as a NumPy array.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def _extract_f0(self, wav, p_len):
        """Run DIO + StoneMask on ``wav`` and resize the contour to ``p_len`` frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        signal = wav.astype(np.double)  # converted once, used by both pyworld calls
        f0, t = pyworld.dio(
            signal,
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            # Duration of one hop in milliseconds.
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        # Vectorised replacement for the per-element round(pitch, 1) loop.
        f0 = np.round(f0, 1)
        return self.resize_f0(f0, p_len)

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav``.

        Args:
            wav: 1-D NumPy array of audio samples.
            p_len: Number of output frames; defaults to
                ``wav.shape[0] // hop_length`` when ``None``.

        Returns:
            1-D array of F0 values with unvoiced frames filled in.
        """
        return self.interpolate_f0(self._extract_f0(wav, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav`` and its voiced mask.

        Args:
            wav: 1-D NumPy array of audio samples.
            p_len: Number of output frames; defaults to
                ``wav.shape[0] // hop_length`` when ``None``.

        Returns:
            Tuple ``(f0, vuv)`` as produced by :meth:`interpolate_f0`.
        """
        return self.interpolate_f0(self._extract_f0(wav, p_len))
infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class F0Predictor:
    """Common interface of the F0 predictors; both methods are no-op placeholders."""

    def compute_f0(self, wav, p_len):
        """Estimate an F0 contour (placeholder that returns ``None``).

        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        return None

    def compute_f0_uv(self, wav, p_len):
        """Estimate an F0 contour plus a voiced/unvoiced mask (placeholder that returns ``None``).

        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        return None
infer_pack/modules/F0Predictor/HarvestF0Predictor.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import pyworld
3
+ import numpy as np
4
+
5
+
6
class HarvestF0Predictor(F0Predictor):
    """F0 predictor that runs pyworld's Harvest estimator refined by StoneMask."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        """Store the analysis settings.

        Args:
            hop_length: Hop size in samples; sets the analysis frame period
                and the default number of output frames.
            f0_min: Passed to Harvest as ``f0_floor``.
            f0_max: Passed to Harvest as ``f0_ceil``.
            sampling_rate: Sampling rate handed to pyworld as ``fs``.
        """
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Interpolate F0 across unvoiced frames.

        Frames whose value is <= 0 are treated as unvoiced. A gap between two
        voiced frames is filled linearly, a gap with no voiced frame before it
        is filled with the next voiced value, and a gap reaching the end is
        filled with the last voiced value seen.

        Args:
            f0: NumPy array of F0 values (its ``size`` attribute is read).

        Returns:
            Tuple ``(f0_interpolated, vuv)`` of 1-D arrays. ``vuv`` is float32
            with 1.0 where the input was > 0 and 0.0 elsewhere.
        """
        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # ``ip_data`` is the same array object as ``data``: every write below
        # is visible through ``data``, and the loop relies on that (once a gap
        # is filled, its frames read as voiced on later iterations).
        # np.reshape may return a view, so the caller's array can be modified.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Look for the next voiced frame; ``j`` keeps its initial
                # value when the range below is empty.
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                # NOTE(review): a voiced frame found exactly at the last index
                # also fails this test and is overwritten by the tail fill --
                # looks like an off-by-one; behaviour kept as is.
                if j < frame_number - 1:
                    if last_value > 0.0:
                        # Linear ramp from the previous voiced frame to frame j.
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        # No voiced frame seen yet: back-fill with frame j.
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    # Tail: hold the last voiced value until the end.
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        """Resample an F0 contour to ``target_len`` frames by linear interpolation.

        Values below 0.001 are replaced by NaN before interpolating and NaNs
        in the result are converted back by ``np.nan_to_num``.

        Args:
            x: Sequence of F0 values.
            target_len: Number of frames to produce.

        Returns:
            The resampled contour as a NumPy array.
        """
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def _extract_f0(self, wav, p_len):
        """Run Harvest + StoneMask on ``wav`` and resize the contour to ``p_len`` frames."""
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        signal = wav.astype(np.double)  # converted once, used by both pyworld calls
        f0, t = pyworld.harvest(
            signal,
            # Fixed: compute_f0 used to pass ``self.hop_length`` here and the
            # nonexistent ``self.fs`` to stonemask (AttributeError).
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            # Duration of one hop in milliseconds.
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(signal, f0, t, self.sampling_rate)
        return self.resize_f0(f0, p_len)

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav``.

        Args:
            wav: 1-D NumPy array of audio samples.
            p_len: Number of output frames; defaults to
                ``wav.shape[0] // hop_length`` when ``None``.

        Returns:
            1-D array of F0 values with unvoiced frames filled in.
        """
        return self.interpolate_f0(self._extract_f0(wav, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav`` and its voiced mask.

        Args:
            wav: 1-D NumPy array of audio samples.
            p_len: Number of output frames; defaults to
                ``wav.shape[0] // hop_length`` when ``None``.

        Returns:
            Tuple ``(f0, vuv)`` as produced by :meth:`interpolate_f0`.
        """
        return self.interpolate_f0(self._extract_f0(wav, p_len))
infer_pack/modules/F0Predictor/PMF0Predictor.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
+ import parselmouth
3
+ import numpy as np
4
+
5
+
6
class PMF0Predictor(F0Predictor):
    """F0 predictor based on parselmouth's autocorrelation pitch tracker."""

    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        """Store the analysis settings.

        Args:
            hop_length: Hop size in samples; sets the analysis time step and
                the default number of output frames.
            f0_min: Passed to the pitch tracker as ``pitch_floor``.
            f0_max: Passed to the pitch tracker as ``pitch_ceiling``.
            sampling_rate: Sampling rate used to build the parselmouth Sound.
        """
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """Interpolate F0 across unvoiced frames.

        Frames whose value is <= 0 are treated as unvoiced. A gap between two
        voiced frames is filled linearly, a gap with no voiced frame before it
        is filled with the next voiced value, and a gap reaching the end is
        filled with the last voiced value seen.

        Args:
            f0: NumPy array of F0 values (its ``size`` attribute is read).

        Returns:
            Tuple ``(f0_interpolated, vuv)`` of 1-D arrays. ``vuv`` is float32
            with 1.0 where the input was > 0 and 0.0 elsewhere.
        """
        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        # ``ip_data`` is the same array object as ``data``: every write below
        # is visible through ``data``, and the loop relies on that (once a gap
        # is filled, its frames read as voiced on later iterations).
        # np.reshape may return a view, so the caller's array can be modified.
        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                # Look for the next voiced frame; ``j`` keeps its initial
                # value when the range below is empty.
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                # NOTE(review): a voiced frame found exactly at the last index
                # also fails this test and is overwritten by the tail fill --
                # looks like an off-by-one; behaviour kept as is.
                if j < frame_number - 1:
                    if last_value > 0.0:
                        # Linear ramp from the previous voiced frame to frame j.
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        # No voiced frame seen yet: back-fill with frame j.
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    # Tail: hold the last voiced value until the end.
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # possibly an unnecessary copy
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def compute_f0(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav``.

        Same computation as :meth:`compute_f0_uv` with the voiced mask dropped.

        Args:
            wav: 1-D NumPy array of audio samples.
            p_len: Expected number of frames; defaults to
                ``wav.shape[0] // hop_length`` when ``None``.

        Returns:
            1-D array of F0 values with unvoiced frames filled in.
        """
        return self.compute_f0_uv(wav, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Return the interpolated F0 contour of ``wav`` and its voiced mask.

        Args:
            wav: 1-D NumPy array of audio samples.
            p_len: Expected number of frames; defaults to
                ``wav.shape[0] // hop_length`` when ``None``. An explicit value
                must be within 3 frames of that default.

        Returns:
            Tuple ``(f0, vuv)`` as produced by :meth:`interpolate_f0`.

        Raises:
            AssertionError: If an explicit ``p_len`` is 4 or more frames away
                from ``wav.shape[0] // hop_length``.
        """
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        # Kept as milliseconds-then-seconds so the floating-point value handed
        # to parselmouth is unchanged.
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        # Zero-pad up to p_len frames: about half of the missing frames go in
        # front, the remainder behind.
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0, uv = self.interpolate_f0(f0)
        return f0, uv
infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
infer_pack/onnx_inference.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import onnxruntime
2
+ import librosa
3
+ import numpy as np
4
+ import soundfile
5
+
6
class ContentVec:
    """Thin ONNX Runtime wrapper around a feature-extraction model file."""

    def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
        """Create the inference session.

        Args:
            vec_path: Path of the ONNX model to load.
            device: ``"cpu"`` or ``None`` (CPU provider), ``"cuda"`` (CUDA with
                CPU fallback) or ``"dml"`` (DirectML provider).

        Raises:
            RuntimeError: If ``device`` is none of the values above.
        """
        print("load model(s) from {}".format(vec_path))
        if device == "cpu" or device is None:
            providers = ["CPUExecutionProvider"]
        elif device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif device == "dml":
            providers = ["DmlExecutionProvider"]
        else:
            # Message spelling fixed (was "Unsportted Device").
            raise RuntimeError("Unsupported Device")
        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)

    def __call__(self, wav):
        """Alias for :meth:`forward`."""
        return self.forward(wav)

    def forward(self, wav):
        """Run the model on one waveform.

        Args:
            wav: NumPy array, either 1-D or 2-D. A 2-D input is averaged over
                its last axis first (presumably the channel axis -- confirm
                against callers).

        Returns:
            The first model output with its last two axes swapped.

        Raises:
            AssertionError: If the waveform is not 1-D after the averaging step.
        """
        feats = wav
        if feats.ndim == 2:  # two channels: average them
            feats = feats.mean(-1)
        assert feats.ndim == 1, feats.ndim
        # Add two leading singleton axes: (n,) -> (1, 1, n).
        feats = np.expand_dims(np.expand_dims(feats, 0), 0)
        onnx_input = {self.model.get_inputs()[0].name: feats}
        logits = self.model.run(None, onnx_input)[0]
        return logits.transpose(0, 2, 1)
31
+
32
+
33
def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
    """Build the F0 predictor selected by name.

    The predictor module is imported only when its name is requested.

    Args:
        f0_predictor: One of ``"pm"``, ``"harvest"`` or ``"dio"``.
        hop_length: Forwarded to the predictor constructor.
        sampling_rate: Forwarded to the predictor constructor.
        **kargs: Accepted but not used.

    Returns:
        A new predictor instance.

    Raises:
        Exception: If ``f0_predictor`` is not a known name.
    """
    if f0_predictor == "pm":
        from infer_pack.modules.F0Predictor.PMF0Predictor import (
            PMF0Predictor as predictor_cls,
        )
    elif f0_predictor == "harvest":
        from infer_pack.modules.F0Predictor.HarvestF0Predictor import (
            HarvestF0Predictor as predictor_cls,
        )
    elif f0_predictor == "dio":
        from infer_pack.modules.F0Predictor.DioF0Predictor import (
            DioF0Predictor as predictor_cls,
        )
    else:
        raise Exception("Unknown f0 predictor")

    # All three predictors take the same two keyword arguments.
    return predictor_cls(hop_length=hop_length, sampling_rate=sampling_rate)
55
+
56
+
57
class OnnxRVC:
    """Voice-conversion inference through ONNX Runtime sessions."""

    def __init__(
        self,
        model_path,
        sr=40000,
        hop_size=512,
        vec_path="vec-768-layer-12",
        device="cpu",
    ):
        """Load the feature extractor and the conversion model.

        Args:
            model_path: Path of the conversion model (ONNX file).
            sr: Sampling rate used when loading audio and for F0 extraction.
            hop_size: Hop size in samples for F0 extraction and output padding.
            vec_path: Base name of the feature model; it is loaded from
                ``pretrained/<vec_path>.onnx``.
            device: ``"cpu"`` or ``None``, ``"cuda"`` or ``"dml"``.

        Raises:
            RuntimeError: If ``device`` is none of the values above.
        """
        vec_path = f"pretrained/{vec_path}.onnx"
        self.vec_model = ContentVec(vec_path, device)
        if device == "cpu" or device is None:
            providers = ["CPUExecutionProvider"]
        elif device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif device == "dml":
            providers = ["DmlExecutionProvider"]
        else:
            # Message spelling fixed (was "Unsportted Device").
            raise RuntimeError("Unsupported Device")
        self.model = onnxruntime.InferenceSession(model_path, providers=providers)
        self.sampling_rate = sr
        self.hop_size = hop_size

    def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
        """Run the conversion model and return int16 samples.

        The six arguments are bound, in this order, to the model's first six
        declared inputs.

        Returns:
            The first model output multiplied by 32767 and cast to int16.
        """
        inputs = self.model.get_inputs()  # queried once instead of six times
        onnx_input = {
            inputs[0].name: hubert,
            inputs[1].name: hubert_length,
            inputs[2].name: pitch,
            inputs[3].name: pitchf,
            inputs[4].name: ds,
            inputs[5].name: rnd,
        }
        return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)

    def inference(
        self,
        raw_path,
        sid,
        f0_method="dio",
        f0_up_key=0,
        pad_time=0.5,
        cr_threshold=0.02,
    ):
        """Convert the audio file at ``raw_path``.

        Args:
            raw_path: Audio file to load (resampled to ``self.sampling_rate``).
            sid: Speaker id, sent to the model as a one-element int64 array.
            f0_method: Name passed to ``get_f0_predictor``.
            f0_up_key: Pitch shift in semitones (F0 is scaled by ``2 ** (key / 12)``).
            pad_time: Not used by this method.
            cr_threshold: Forwarded to ``get_f0_predictor`` as ``threshold``.

        Returns:
            int16 NumPy array truncated to the length of the loaded input.

        Raises:
            RuntimeError: If the loaded audio is longer than 50 seconds.
        """
        f0_min = 50
        f0_max = 1100
        # Mel-scale bounds used to quantise F0 into the coarse range 1..255.
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_predictor = get_f0_predictor(
            f0_method,
            hop_length=self.hop_size,
            sampling_rate=self.sampling_rate,
            threshold=cr_threshold,
        )
        wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
        org_length = len(wav)
        if org_length / sr > 50.0:
            raise RuntimeError("Reached Max Length")

        # The feature model is fed a 16 kHz copy of the signal.
        wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)

        hubert = self.vec_model(wav16k)
        # Duplicate every frame, then swap the last two axes back.
        hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
        hubert_length = hubert.shape[1]

        pitchf = f0_predictor.compute_f0(wav, hubert_length)
        pitchf = pitchf * 2 ** (f0_up_key / 12)

        # Coarse pitch: mel-scaled F0 mapped linearly onto 1..255 and rounded.
        f0_mel = 1127 * np.log(1 + pitchf / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch = np.rint(f0_mel).astype(np.int64)

        pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
        pitch = pitch.reshape(1, len(pitch))
        ds = np.array([sid]).astype(np.int64)

        # 192 is presumably the model's latent channel count -- TODO confirm.
        rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
        hubert_length = np.array([hubert_length]).astype(np.int64)

        out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
        # Pad two hops of zeros at the end before cutting back to the input length.
        out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
        return out_wav[0:org_length]
vc_infer_pipeline.py CHANGED
@@ -184,7 +184,7 @@ class VC(object):
184
  with torch.no_grad():
185
  logits = model.extract_features(**inputs)
186
  feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
187
- if protect < 0.5:
188
  feats0 = feats.clone()
189
  if (
190
  isinstance(index, type(None)) == False
@@ -211,7 +211,7 @@ class VC(object):
211
  )
212
 
213
  feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
214
- if protect < 0.5:
215
  feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
216
  0, 2, 1
217
  )
@@ -223,7 +223,7 @@ class VC(object):
223
  pitch = pitch[:, :p_len]
224
  pitchf = pitchf[:, :p_len]
225
 
226
- if protect < 0.5:
227
  pitchff = pitchf.clone()
228
  pitchff[pitchf > 0] = 1
229
  pitchff[pitchf < 1] = protect
 
184
  with torch.no_grad():
185
  logits = model.extract_features(**inputs)
186
  feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
187
+ if protect < 0.5 and pitch!=None and pitchf!=None:
188
  feats0 = feats.clone()
189
  if (
190
  isinstance(index, type(None)) == False
 
211
  )
212
 
213
  feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
214
+ if protect < 0.5 and pitch!=None and pitchf!=None:
215
  feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
216
  0, 2, 1
217
  )
 
223
  pitch = pitch[:, :p_len]
224
  pitchf = pitchf[:, :p_len]
225
 
226
+ if protect < 0.5 and pitch!=None and pitchf!=None:
227
  pitchff = pitchf.clone()
228
  pitchff[pitchf > 0] = 1
229
  pitchff[pitchf < 1] = protect