Update vc_infer_pipeline.py
vc_infer_pipeline.py (+64 -48)
@@ -14,9 +14,6 @@ import torch
 import torch.nn.functional as F
 import torchcrepe
 from scipy import signal
-import logging
-logger = logging.getLogger(__name__)
-
 
 now_dir = os.getcwd()
 sys.path.append(now_dir)
@@ -172,7 +169,7 @@ class VC(object):
         model,
         net_g,
         sid,
-        audio,
+        audio0,
         pitch,
         pitchf,
         times,
@@ -182,8 +179,7 @@ class VC(object):
         version,
         protect,
     ):
-
-        feats = torch.from_numpy(audio)
+        feats = torch.from_numpy(audio0)
        if self.is_half:
             feats = feats.half()
         else:
@@ -193,7 +189,7 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-
+
         inputs = {
             "source": feats.to(self.device),
             "padding_mask": padding_mask,
@@ -205,9 +201,6 @@ class VC(object):
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
         if protect < 0.5 and pitch is not None and pitchf is not None:
             feats0 = feats.clone()
-
-        logger.info(f"Feats shape after processing: {feats.shape}")
-
         if (
             index is not None
             and big_npy is not None
@@ -216,30 +209,32 @@ class VC(object):
             npy = feats[0].cpu().numpy()
             if self.is_half:
                 npy = npy.astype("float32")
-
+
             score, ix = index.search(npy, k=8)
             weight = np.square(1 / score)
             weight /= weight.sum(axis=1, keepdims=True)
             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-
+
             if self.is_half:
                 npy = npy.astype("float16")
             feats = (
                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                 + (1 - index_rate) * feats
             )
-
+
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         if protect < 0.5 and pitch is not None and pitchf is not None:
-            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
         t1 = ttime()
-        p_len = audio.shape[0] // self.window
+        p_len = audio0.shape[0] // self.window
         if feats.shape[1] < p_len:
             p_len = feats.shape[1]
         if pitch is not None and pitchf is not None:
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]
-
+
         if protect < 0.5 and pitch is not None and pitchf is not None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
@@ -266,9 +261,8 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
-        logger.info(f"VC output shape: {audio1.shape}")
         return audio1
-
+
     def pipeline(
         self,
         model,
@@ -290,7 +284,6 @@ class VC(object):
         protect,
         f0_file=None,
     ):
-        logger.info(f"Starting pipeline with audio shape: {audio.shape}")
         if (
             file_index != ""
             and os.path.exists(file_index)
@@ -320,12 +313,6 @@ class VC(object):
                     == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                 )[0][0]
             )
-        logger.info(f"Number of opt_ts: {len(opt_ts)}")
-
-        if len(opt_ts) == 0:
-            logger.info("No optimal time steps found. Processing entire audio.")
-            opt_ts = [audio.shape[0]]
-
         s = 0
         audio_opt = []
         t = None
@@ -363,17 +350,52 @@ class VC(object):
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for i, t in enumerate(opt_ts):
+        for t in opt_ts:
             t = t // self.window * self.window
-            logger.info(f"Processing segment {i+1}/{len(opt_ts)}")
             if if_f0 == 1:
-                segment = self.vc(
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            else:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            s = t
+        if if_f0 == 1:
+            audio_opt.append(
+                self.vc(
                     model,
                     net_g,
                     sid,
-                    audio_pad[s : t + self.t_pad2 + self.window],
-                    pitch[:, s // self.window : (t + self.t_pad2) // self.window],
-                    pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
                     times,
                     index,
                     big_npy,
@@ -381,12 +403,14 @@ class VC(object):
                     version,
                     protect,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
-            else:
-                segment = self.vc(
+            )
+        else:
+            audio_opt.append(
+                self.vc(
                     model,
                     net_g,
                     sid,
-                    audio_pad[s : t + self.t_pad2 + self.window],
+                    audio_pad[t:],
                     None,
                     None,
                     times,
@@ -396,16 +420,7 @@ class VC(object):
                     version,
                     protect,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
-
-            logger.info(f"Segment {i+1} shape: {segment.shape}")
-            audio_opt.append(segment)
-            s = t
-
-        logger.info(f"Number of audio segments: {len(audio_opt)}")
-
-        if not audio_opt:
-            raise ValueError("No audio segments were generated")
-
+            )
         audio_opt = np.concatenate(audio_opt)
         if rms_mix_rate != 1:
             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
@@ -416,12 +431,13 @@ class VC(object):
         if audio_max > 1:
             max_int16 /= audio_max
         audio_opt = (audio_opt * max_int16).astype(np.int16)
-
-
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         return audio_opt
-
+
     def parallel_pipeline(self, tasks):
         with ThreadPoolExecutor() as executor:
             futures = [executor.submit(self.pipeline, *task) for task in tasks]
             results = [future.result() for future in futures]
-            return results
+        return results
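The index lookup retained above (score, ix = index.search(npy, k=8) followed by the inverse-square weighting) is a weighted k-nearest-neighbor feature blend: each HuBERT frame is replaced by a distance-weighted average of its 8 closest training frames, then mixed back with the original frame by index_rate. Below is a minimal, self-contained sketch of the same arithmetic, assuming faiss-cpu is installed and using toy data in place of a real feature index:

import faiss
import numpy as np

def blend_with_index(feats, index, big_npy, index_rate, k=8):
    """Weighted k-NN retrieval blend, mirroring the step kept in this diff.

    feats:   (T, D) float32 query frames
    big_npy: (N, D) float32 training frames the index was built from
    """
    score, ix = index.search(feats, k)            # squared L2 distances, ids
    weight = np.square(1 / score)                 # closer frames weigh more
    weight /= weight.sum(axis=1, keepdims=True)   # normalize per frame
    retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
    # Linear mix between retrieved and original features.
    return index_rate * retrieved + (1 - index_rate) * feats

# Toy usage (illustrative data, not from the repository):
rng = np.random.default_rng(0)
big_npy = rng.random((1000, 256), dtype=np.float32)
index = faiss.IndexFlatL2(256)
index.add(big_npy)
feats = rng.random((50, 256), dtype=np.float32)
blended = blend_with_index(feats, index, big_npy, index_rate=0.75)

Note that np.square(1 / score) divides by zero when a query frame exactly matches a stored frame (distance 0); the pipeline code above has the same edge case.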
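The restored loop splits the padded input at the low-energy cut points collected in opt_ts: each cut is snapped to a self.window boundary, every chunk is passed to self.vc with t_pad2 + window samples of extra context, and the if/else after the loop converts the tail from the last cut (audio_pad[t:]). A small sketch of just the slice arithmetic, with illustrative values for window and t_pad2 (the real values come from the pipeline's configuration, not this diff):

window = 160          # assumed samples per feature frame
t_pad2 = 2 * 16000    # assumed two-sided padding budget, in samples

def chunk_bounds(opt_ts, total_len):
    """Return the (start, end) input slices in the order the loop emits them."""
    bounds = []
    s = 0
    for t in opt_ts:
        t = t // window * window                 # snap cut to a frame boundary
        bounds.append((s, t + t_pad2 + window))  # chunk plus context padding
        s = t                                    # next chunk starts at the cut
    bounds.append((s, total_len))                # tail chunk: audio_pad[t:]
    return bounds

# Two cuts in a 10 s, 16 kHz padded signal:
print(chunk_bounds([48000, 96000], 160000))
# [(0, 80160), (48000, 128160), (96000, 160000)]

Consecutive slices overlap by design; the [self.t_pad_tgt : -self.t_pad_tgt] trim on each converted chunk removes that context again so the concatenated output has no seams.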
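parallel_pipeline fans a list of argument tuples out to a thread pool and collects results in submission order, which keeps outputs aligned with inputs. A runnable toy of the same pattern, with a hypothetical Worker.pipeline standing in for the real method (threads only pay off here to the extent the heavy torch/faiss calls release the GIL):

from concurrent.futures import ThreadPoolExecutor

class Worker:
    def pipeline(self, samples, gain):
        # Hypothetical stand-in for the real pipeline() signature.
        return [s * gain for s in samples]

    def parallel_pipeline(self, tasks):
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.pipeline, *task) for task in tasks]
            results = [future.result() for future in futures]  # keeps order
        return results

w = Worker()
print(w.parallel_pipeline([([1, 2, 3], 2), ([4, 5], 10)]))
# [[2, 4, 6], [40, 50]]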