MAZALA2024 committed
Commit 359c2d0 · verified · 1 Parent(s): 8077420

Update vc_infer_pipeline.py

Files changed (1)
  1. vc_infer_pipeline.py +64 -48
vc_infer_pipeline.py CHANGED
@@ -14,9 +14,6 @@ import torch
 import torch.nn.functional as F
 import torchcrepe
 from scipy import signal
-import logging
-logger = logging.getLogger(__name__)
-
 
 now_dir = os.getcwd()
 sys.path.append(now_dir)
@@ -172,7 +169,7 @@ class VC(object):
         model,
         net_g,
         sid,
-        audio,
+        audio0,
         pitch,
         pitchf,
         times,
@@ -182,8 +179,7 @@ class VC(object):
         version,
         protect,
     ):
-        logger.info(f"VC input shape: {audio.shape}")
-        feats = torch.from_numpy(audio)
+        feats = torch.from_numpy(audio0)
         if self.is_half:
             feats = feats.half()
         else:
@@ -193,7 +189,7 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-
+
         inputs = {
             "source": feats.to(self.device),
             "padding_mask": padding_mask,
@@ -205,9 +201,6 @@ class VC(object):
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
         if protect < 0.5 and pitch is not None and pitchf is not None:
             feats0 = feats.clone()
-
-        logger.info(f"Feats shape after processing: {feats.shape}")
-
         if (
             index is not None
             and big_npy is not None
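A note on the surrounding code: `feats` comes out of a fairseq-style HuBERT encoder via the `inputs` dict shown earlier, and `final_proj` only applies to v1 models, which work in 256 dimensions; v2 models keep the encoder's 768-dim output. A minimal sketch of that step, assuming the fairseq `extract_features` API that upstream RVC builds on:

```python
import torch

def extract_feats(model, feats, padding_mask, version="v2"):
    # Sketch of the extraction step above (fairseq HuBERT API assumed):
    # v1 checkpoints read layer 9 and project 768 -> 256 via final_proj,
    # v2 checkpoints use the 768-dim layer-12 output directly.
    inputs = {
        "source": feats,               # (1, T) waveform tensor on the device
        "padding_mask": padding_mask,  # all-False for a single unpadded clip
        "output_layer": 9 if version == "v1" else 12,
    }
    with torch.no_grad():
        logits = model.extract_features(**inputs)
    return model.final_proj(logits[0]) if version == "v1" else logits[0]
```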
@@ -216,30 +209,32 @@ class VC(object):
             npy = feats[0].cpu().numpy()
             if self.is_half:
                 npy = npy.astype("float32")
-
+
             score, ix = index.search(npy, k=8)
             weight = np.square(1 / score)
             weight /= weight.sum(axis=1, keepdims=True)
             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-
+
             if self.is_half:
                 npy = npy.astype("float16")
             feats = (
                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                 + (1 - index_rate) * feats
             )
-
+
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         if protect < 0.5 and pitch is not None and pitchf is not None:
-            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
         t1 = ttime()
-        p_len = audio.shape[0] // self.window
+        p_len = audio0.shape[0] // self.window
         if feats.shape[1] < p_len:
             p_len = feats.shape[1]
             if pitch is not None and pitchf is not None:
                 pitch = pitch[:, :p_len]
                 pitchf = pitchf[:, :p_len]
-
+
         if protect < 0.5 and pitch is not None and pitchf is not None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
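The block restored above is the stock RVC retrieval step: every HuBERT frame queries the FAISS index for its 8 nearest training-set vectors, the neighbours are averaged with inverse-squared-distance weights, and the result is blended with the live features at `index_rate`. The `pitchff` lines that follow belong to the `protect` option, which later pulls unvoiced frames back toward the un-indexed `feats0`. A self-contained sketch of the weighting, with hypothetical sizes:

```python
import numpy as np
import faiss  # pip install faiss-cpu

dim = 256  # hypothetical feature size (v1 models use 256, v2 use 768)
big_npy = np.random.rand(1000, dim).astype("float32")  # training-set features
index = faiss.IndexFlatL2(dim)
index.add(big_npy)

feats = np.random.rand(50, dim).astype("float32")  # stand-in for feats[0]

score, ix = index.search(feats, k=8)         # squared L2 distances and row ids
weight = np.square(1 / score)                # closer neighbours dominate
weight /= weight.sum(axis=1, keepdims=True)  # normalise weights per frame
retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

index_rate = 0.75  # 1.0 = pure retrieval timbre, 0.0 = live features only
blended = index_rate * retrieved + (1 - index_rate) * feats
```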
@@ -266,9 +261,8 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
-        logger.info(f"VC output shape: {audio1.shape}")
         return audio1
-
+
     def pipeline(
         self,
         model,
@@ -290,7 +284,6 @@ class VC(object):
         protect,
         f0_file=None,
     ):
-        logger.info(f"Starting pipeline with audio shape: {audio.shape}")
         if (
             file_index != ""
             and os.path.exists(file_index)
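For reference, this guard only enables retrieval when `file_index` points at a real file and `index_rate` is non-zero; the branch body in upstream RVC then loads the index and materialises its vectors, roughly as below (path hypothetical):

```python
import faiss

file_index = "logs/added_IVF256_Flat_nprobe_1.index"  # hypothetical path
index = faiss.read_index(file_index)            # trained index shipped with the model
big_npy = index.reconstruct_n(0, index.ntotal)  # dense copy of all stored vectors
```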
@@ -320,12 +313,6 @@ class VC(object):
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
-        logger.info(f"Number of opt_ts: {len(opt_ts)}")
-
-        if len(opt_ts) == 0:
-            logger.info("No optimal time steps found. Processing entire audio.")
-            opt_ts = [audio.shape[0]]
-
         s = 0
         audio_opt = []
         t = None
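Context for the deletion: `opt_ts` holds candidate cut points, computed only when the input is long enough to need splitting; each tentative cut is shifted to the quietest sample within ±`t_query` of it, so seams fall in low-energy audio. The removed fallback was redundant, because with an empty `opt_ts` the loop below simply never runs and the tail call (note `t = None` above, and `audio_pad[t:]` is then the whole array) converts the clip in one piece. A toy sketch of the cut-point search, window sizes hypothetical:

```python
import numpy as np

audio_sum = np.abs(np.random.randn(16000 * 30))  # stand-in smoothed envelope
t_center, t_query = 16000 * 10, 16000 // 2       # hypothetical sizes

opt_ts = []
for t in range(t_center, len(audio_sum) - t_query, t_center):
    seg = audio_sum[t - t_query : t + t_query]
    # shift the cut to the quietest sample near the tentative boundary
    opt_ts.append(t - t_query + int(seg.argmin()))
```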
@@ -363,17 +350,52 @@ class VC(object):
         pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for i, t in enumerate(opt_ts):
+        for t in opt_ts:
             t = t // self.window * self.window
-            logger.info(f"Processing segment {i+1}/{len(opt_ts)}")
             if if_f0 == 1:
-                segment = self.vc(
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            else:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            s = t
+        if if_f0 == 1:
+            audio_opt.append(
+                self.vc(
                     model,
                     net_g,
                     sid,
-                    audio_pad[s : t + self.t_pad2 + self.window],
-                    pitch[:, s // self.window : (t + self.t_pad2) // self.window],
-                    pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
                     times,
                     index,
                     big_npy,
@@ -381,12 +403,14 @@ class VC(object):
                     version,
                     protect,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
-            else:
-                segment = self.vc(
+            )
+        else:
+            audio_opt.append(
+                self.vc(
                     model,
                     net_g,
                     sid,
-                    audio_pad[s : t + self.t_pad2 + self.window],
+                    audio_pad[t:],
                     None,
                     None,
                     times,
@@ -396,16 +420,7 @@ class VC(object):
                     version,
                     protect,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
-
-            logger.info(f"Segment {i+1} shape: {segment.shape}")
-            audio_opt.append(segment)
-            s = t
-
-        logger.info(f"Number of audio segments: {len(audio_opt)}")
-
-        if not audio_opt:
-            raise ValueError("No audio segments were generated")
-
+            )
         audio_opt = np.concatenate(audio_opt)
         if rms_mix_rate != 1:
             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
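Taken together, the three hunks above restore the upstream segment loop: each iteration converts `audio_pad[s : t + t_pad2 + window]`, the chunk plus trailing context, slices the matching pitch windows, and trims `t_pad_tgt` samples from both ends of the result so the reflect padding never reaches the output; the extra call after the loop covers everything from the last cut to the end. A compressed sketch of the slicing pattern, where the identity `convert` is a hypothetical stand-in for `self.vc` (real outputs are also resampled to `tgt_sr`):

```python
import numpy as np

def convert(chunk):      # hypothetical stand-in for self.vc(...)
    return chunk.copy()  # identity keeps the sketch runnable

window, t_pad2, t_pad_tgt = 160, 32000, 16000  # hypothetical sizes
audio_pad = np.random.randn(200000).astype(np.float32)
opt_ts, audio_opt, s, t = [48000, 96000], [], 0, None

for t in opt_ts:
    t = t // window * window  # snap each cut to a frame boundary
    audio_opt.append(
        convert(audio_pad[s : t + t_pad2 + window])[t_pad_tgt:-t_pad_tgt]
    )
    s = t
# tail: from the last cut (or the start, when opt_ts was empty) to the end
audio_opt.append(convert(audio_pad[t:])[t_pad_tgt:-t_pad_tgt])
out = np.concatenate(audio_opt)
```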
@@ -416,12 +431,13 @@ class VC(object):
         if audio_max > 1:
             max_int16 /= audio_max
         audio_opt = (audio_opt * max_int16).astype(np.int16)
-
-        logger.info(f"Final audio_opt shape: {audio_opt.shape}")
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         return audio_opt
-
+
     def parallel_pipeline(self, tasks):
         with ThreadPoolExecutor() as executor:
             futures = [executor.submit(self.pipeline, *task) for task in tasks]
             results = [future.result() for future in futures]
-            return results
+            return results
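The final hunk drops the last log line, keeps the peak-aware int16 cast, and adds explicit cleanup. The cast lowers the gain when the float signal would exceed full scale, so the conversion cannot wrap around; the `audio_max` computation itself is outside this diff (upstream RVC derives it from `np.abs(audio_opt).max()` with a little headroom). A standalone sketch:

```python
import numpy as np
import torch

audio_opt = (np.random.randn(16000) * 1.3).astype(np.float32)  # may exceed +/-1

audio_max = np.abs(audio_opt).max() / 0.99  # peak with ~1% headroom (assumed)
max_int16 = 32768
if audio_max > 1:
    max_int16 /= audio_max  # reduce the gain instead of clipping or wrapping
audio_int16 = (audio_opt * max_int16).astype(np.int16)

# the added cleanup: release cached GPU blocks between conversions
if torch.cuda.is_available():
    torch.cuda.empty_cache()
```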
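`parallel_pipeline` fans each tuple in `tasks` out as the positional arguments of one `pipeline` call and gathers results in submission order. Since these are threads, they only truly overlap where the work releases the GIL, such as native or GPU code. A hypothetical usage sketch with a stand-in function:

```python
from concurrent.futures import ThreadPoolExecutor

def pipeline(model, audio):  # stand-in with a pipeline-like signature
    return f"{model}:{len(audio)}"

tasks = [("modelA", [0.0] * 16000), ("modelB", [0.0] * 32000)]
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(pipeline, *task) for task in tasks]
    results = [f.result() for f in futures]
print(results)  # ['modelA:16000', 'modelB:32000']
```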