Update vc_infer_pipeline.py
vc_infer_pipeline.py (+64 -48)
@@ -14,9 +14,6 @@ import torch
 import torch.nn.functional as F
 import torchcrepe
 from scipy import signal
-import logging
-logger = logging.getLogger(__name__)
-
 
 now_dir = os.getcwd()
 sys.path.append(now_dir)
@@ -172,7 +169,7 @@ class VC(object):
         model,
         net_g,
         sid,
-        audio,
+        audio0,
         pitch,
         pitchf,
         times,
@@ -182,8 +179,7 @@ class VC(object):
         version,
         protect,
     ):
-
-        feats = torch.from_numpy(audio)
+        feats = torch.from_numpy(audio0)
        if self.is_half:
             feats = feats.half()
         else:
@@ -193,7 +189,7 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-
+
         inputs = {
             "source": feats.to(self.device),
             "padding_mask": padding_mask,
@@ -205,9 +201,6 @@ class VC(object):
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
         if protect < 0.5 and pitch is not None and pitchf is not None:
             feats0 = feats.clone()
-
-        logger.info(f"Feats shape after processing: {feats.shape}")
-
         if (
             index is not None
             and big_npy is not None
@@ -216,30 +209,32 @@ class VC(object):
             npy = feats[0].cpu().numpy()
             if self.is_half:
                 npy = npy.astype("float32")
-
+
             score, ix = index.search(npy, k=8)
             weight = np.square(1 / score)
             weight /= weight.sum(axis=1, keepdims=True)
             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-
+
             if self.is_half:
                 npy = npy.astype("float16")
             feats = (
                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                 + (1 - index_rate) * feats
             )
-
+
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         if protect < 0.5 and pitch is not None and pitchf is not None:
-            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
         t1 = ttime()
-        p_len = audio.shape[0] // self.window
+        p_len = audio0.shape[0] // self.window
         if feats.shape[1] < p_len:
             p_len = feats.shape[1]
         if pitch is not None and pitchf is not None:
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]
-
+
         if protect < 0.5 and pitch is not None and pitchf is not None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
@@ -266,9 +261,8 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
-        logger.info(f"VC output shape: {audio1.shape}")
         return audio1
-
+
     def pipeline(
         self,
         model,
@@ -290,7 +284,6 @@ class VC(object):
         protect,
         f0_file=None,
     ):
-        logger.info(f"Starting pipeline with audio shape: {audio.shape}")
         if (
             file_index != ""
             and os.path.exists(file_index)
@@ -320,12 +313,6 @@ class VC(object):
                     == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                 )[0][0]
             )
-        logger.info(f"Number of opt_ts: {len(opt_ts)}")
-
-        if len(opt_ts) == 0:
-            logger.info("No optimal time steps found. Processing entire audio.")
-            opt_ts = [audio.shape[0]]
-
         s = 0
         audio_opt = []
         t = None
@@ -363,17 +350,52 @@ class VC(object):
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for i, t in enumerate(opt_ts):
+        for t in opt_ts:
             t = t // self.window * self.window
-            logger.info(f"Processing segment {i+1}/{len(opt_ts)}")
             if if_f0 == 1:
-                segment = self.vc(
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            else:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            s = t
+        if if_f0 == 1:
+            audio_opt.append(
+                self.vc(
                     model,
                     net_g,
                     sid,
-                    audio_pad[s : t + self.t_pad2 + self.window],
-                    pitch[:, s // self.window : (t + self.t_pad2) // self.window],
-                    pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
                     times,
                     index,
                     big_npy,
@@ -381,12 +403,14 @@ class VC(object):
                     version,
                     protect,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
-            else:
-                segment = self.vc(
+            )
+        else:
+            audio_opt.append(
+                self.vc(
                     model,
                     net_g,
                     sid,
-                    audio_pad[s : t + self.t_pad2 + self.window],
+                    audio_pad[t:],
                     None,
                     None,
                     times,
@@ -396,16 +420,7 @@ class VC(object):
                     version,
                     protect,
                 )[self.t_pad_tgt : -self.t_pad_tgt]
-
-            logger.info(f"Segment {i+1} shape: {segment.shape}")
-            audio_opt.append(segment)
-            s = t
-
-        logger.info(f"Number of audio segments: {len(audio_opt)}")
-
-        if not audio_opt:
-            raise ValueError("No audio segments were generated")
-
+            )
         audio_opt = np.concatenate(audio_opt)
         if rms_mix_rate != 1:
             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
@@ -416,12 +431,13 @@ class VC(object):
         if audio_max > 1:
             max_int16 /= audio_max
         audio_opt = (audio_opt * max_int16).astype(np.int16)
-
-
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         return audio_opt
-
+
     def parallel_pipeline(self, tasks):
         with ThreadPoolExecutor() as executor:
             futures = [executor.submit(self.pipeline, *task) for task in tasks]
             results = [future.result() for future in futures]
-            return results
+        return results
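The index lookup retained above (score, ix = index.search(npy, k=8) followed by the inverse-square weighting) is a weighted k-nearest-neighbor feature blend: each HuBERT frame is replaced by a distance-weighted average of its 8 closest training frames, then mixed back with the original frame by index_rate. Below is a minimal, self-contained sketch of the same arithmetic, assuming faiss-cpu is installed and using toy data in place of a real feature index:

import faiss
import numpy as np

def blend_with_index(feats, index, big_npy, index_rate, k=8):
    """Weighted k-NN retrieval blend, mirroring the step kept in this diff.

    feats:   (T, D) float32 query frames
    big_npy: (N, D) float32 training frames the index was built from
    """
    score, ix = index.search(feats, k)            # squared L2 distances, ids
    weight = np.square(1 / score)                 # closer frames weigh more
    weight /= weight.sum(axis=1, keepdims=True)   # normalize per frame
    retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
    # Linear mix between retrieved and original features.
    return index_rate * retrieved + (1 - index_rate) * feats

# Toy usage (illustrative data, not from the repository):
rng = np.random.default_rng(0)
big_npy = rng.random((1000, 256), dtype=np.float32)
index = faiss.IndexFlatL2(256)
index.add(big_npy)
feats = rng.random((50, 256), dtype=np.float32)
blended = blend_with_index(feats, index, big_npy, index_rate=0.75)

Note that np.square(1 / score) divides by zero when a query frame exactly matches a stored frame (distance 0); the pipeline code above has the same edge case.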
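The restored loop splits the padded input at the low-energy cut points collected in opt_ts: each cut is snapped to a self.window boundary, every chunk is passed to self.vc with t_pad2 + window samples of extra context, and the if/else after the loop converts the tail from the last cut (audio_pad[t:]). A small sketch of just the slice arithmetic, with illustrative values for window and t_pad2 (the real values come from the pipeline's configuration, not this diff):

window = 160          # assumed samples per feature frame
t_pad2 = 2 * 16000    # assumed two-sided padding budget, in samples

def chunk_bounds(opt_ts, total_len):
    """Return the (start, end) input slices in the order the loop emits them."""
    bounds = []
    s = 0
    for t in opt_ts:
        t = t // window * window                 # snap cut to a frame boundary
        bounds.append((s, t + t_pad2 + window))  # chunk plus context padding
        s = t                                    # next chunk starts at the cut
    bounds.append((s, total_len))                # tail chunk: audio_pad[t:]
    return bounds

# Two cuts in a 10 s, 16 kHz padded signal:
print(chunk_bounds([48000, 96000], 160000))
# [(0, 80160), (48000, 128160), (96000, 160000)]

Consecutive slices overlap by design; the [self.t_pad_tgt : -self.t_pad_tgt] trim on each converted chunk removes that context again so the concatenated output has no seams.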
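parallel_pipeline fans a list of argument tuples out to a thread pool and collects results in submission order, which keeps outputs aligned with inputs. A runnable toy of the same pattern, with a hypothetical Worker.pipeline standing in for the real method (threads only pay off here to the extent the heavy torch/faiss calls release the GIL):

from concurrent.futures import ThreadPoolExecutor

class Worker:
    def pipeline(self, samples, gain):
        # Hypothetical stand-in for the real pipeline() signature.
        return [s * gain for s in samples]

    def parallel_pipeline(self, tasks):
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.pipeline, *task) for task in tasks]
            results = [future.result() for future in futures]  # keeps order
        return results

w = Worker()
print(w.parallel_pipeline([([1, 2, 3], 2), ([4, 5], 10)]))
# [[2, 4, 6], [40, 50]]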