Update modules/v2/vc_wrapper.py
modules/v2/vc_wrapper.py  +21 -80
@@ -1,3 +1,4 @@
+import spaces
 import torch
 import librosa
 import torchaudio
@@ -52,56 +53,6 @@ class VoiceConversionWrapper(torch.nn.Module):
         self.ar_max_content_len = 1500  # in num of narrow tokens
         self.compile_len = 87 * self.dit_max_context_len
 
-    def forward_cfm(self, content_indices_wide, content_lens, mels, mel_lens, style_vectors):
-        device = content_indices_wide.device
-        B = content_indices_wide.size(0)
-        cond, _ = self.cfm_length_regulator(content_indices_wide, ylens=mel_lens)
-
-        # randomly set a length as prompt
-        prompt_len_max = mel_lens - 1
-        prompt_len = (torch.rand([B], device=device) * prompt_len_max).floor().to(dtype=torch.long)
-        prompt_len[torch.rand([B], device=device) < 0.1] = 0
-
-        loss = self.cfm(mels, mel_lens, prompt_len, cond, style_vectors)
-        return loss
-
-    def forward_ar(self, content_indices_narrow, content_indices_wide, content_lens):
-        device = content_indices_narrow.device
-        duration_reduced_narrow_tokens = []
-        duration_reduced_narrow_lens = []
-        for bib in range(content_indices_narrow.size(0)):
-            reduced, reduced_len = self.duration_reduction_func(content_indices_narrow[bib])
-            duration_reduced_narrow_tokens.append(reduced)
-            duration_reduced_narrow_lens.append(reduced_len)
-        duration_reduced_narrow_tokens = torch.nn.utils.rnn.pad_sequence(duration_reduced_narrow_tokens,
-                                                                         batch_first=True, padding_value=0).to(device)
-        duration_reduced_narrow_lens = torch.LongTensor(duration_reduced_narrow_lens).to(device)
-
-        # interpolate speech token to match acoustic feature length
-        cond, _ = self.ar_length_regulator(duration_reduced_narrow_tokens)
-        loss = self.ar(cond, duration_reduced_narrow_lens, content_indices_wide, content_lens)
-        return loss
-
-    def forward(self, waves_16k, mels, wave_lens_16k, mel_lens, forward_ar=False, forward_cfm=True):
-        """
-        Forward pass for the model.
-        """
-        # extract wide content features as both AR and CFM models use them
-        with torch.no_grad():
-            _, content_indices_wide, content_lens = self.content_extractor_wide(waves_16k, wave_lens_16k)
-        if forward_ar:
-            # extract narrow content features for AR model
-            _, content_indices_narrow, _ = self.content_extractor_narrow(waves_16k, wave_lens_16k, ssl_model=self.content_extractor_wide.ssl_model)
-            loss_ar = self.forward_ar(content_indices_narrow.clone(), content_indices_wide.clone(), content_lens)
-        else:
-            loss_ar = torch.tensor(0.0, device=waves_16k.device, dtype=waves_16k.dtype)
-        if forward_cfm:
-            style_vectors = self.compute_style(waves_16k, wave_lens_16k)
-            loss_cfm = self.forward_cfm(content_indices_wide, content_lens, mels, mel_lens, style_vectors)
-        else:
-            loss_cfm = torch.tensor(0.0, device=waves_16k.device, dtype=waves_16k.dtype)
-        return loss_ar, loss_cfm
-
     def compile_ar(self):
         """
         Compile the AR model for inference.
@@ -258,28 +209,24 @@ class VoiceConversionWrapper(torch.nn.Module):
                 repo_id=DEFAULT_REPO_ID,
                 model_filename=DEFAULT_CFM_CHECKPOINT,
             )
-        else:
-            print(f"Loading CFM checkpoint from {cfm_checkpoint_path}...")
         if ar_checkpoint_path is None:
             ar_checkpoint_path = load_custom_model_from_hf(
                 repo_id=DEFAULT_REPO_ID,
                 model_filename=DEFAULT_AR_CHECKPOINT,
             )
-        else:
-            print(f"Loading AR checkpoint from {ar_checkpoint_path}...")
         # cfm
         cfm_checkpoint = torch.load(cfm_checkpoint_path, map_location="cpu")
         cfm_length_regulator_state_dict = self.strip_prefix(cfm_checkpoint["net"]['length_regulator'], "module.")
         cfm_state_dict = self.strip_prefix(cfm_checkpoint["net"]['cfm'], "module.")
-
-
+        self.cfm.load_state_dict(cfm_state_dict, strict=False)
+        self.cfm_length_regulator.load_state_dict(cfm_length_regulator_state_dict, strict=False)
 
         # ar
         ar_checkpoint = torch.load(ar_checkpoint_path, map_location="cpu")
         ar_length_regulator_state_dict = self.strip_prefix(ar_checkpoint["net"]['length_regulator'], "module.")
         ar_state_dict = self.strip_prefix(ar_checkpoint["net"]['ar'], "module.")
-
-
+        self.ar.load_state_dict(ar_state_dict, strict=False)
+        self.ar_length_regulator.load_state_dict(ar_length_regulator_state_dict, strict=False)
 
         # content extractor
         content_extractor_narrow_checkpoint_path = load_custom_model_from_hf(
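The new load_state_dict calls pass strict=False, which tolerates missing or unexpected keys once the "module." prefix left by DataParallel/DDP training has been stripped. A minimal, self-contained sketch of that loading pattern (the model and state dict here are illustrative stand-ins, not the wrapper's real checkpoints):

    import torch

    model = torch.nn.Linear(4, 4)                                        # hypothetical stand-in model
    raw = {"module.weight": torch.zeros(4, 4), "module.bias": torch.zeros(4)}
    state_dict = {k.removeprefix("module."): v for k, v in raw.items()}  # strip the DDP prefix
    # strict=False skips mismatched keys and reports them instead of raising
    result = model.load_state_dict(state_dict, strict=False)
    print("missing:", result.missing_keys, "unexpected:", result.unexpected_keys)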
@@ -308,26 +255,13 @@ class VoiceConversionWrapper(torch.nn.Module):
     def setup_ar_caches(self, max_batch_size=1, max_seq_len=4096, dtype=torch.float32, device=torch.device("cpu")):
         self.ar.setup_caches(max_batch_size=max_batch_size, max_seq_len=max_seq_len, dtype=dtype, device=device)
 
-
-
-
-
-
-
-
-                                                     num_mel_bins=80,
-                                                     dither=0,
-                                                     sample_frequency=16000)
-            feat = feat - feat.mean(dim=0, keepdim=True)
-            feat_list.append(feat)
-        max_feat_len = max([feat.size(0) for feat in feat_list])
-        feat_lens = torch.tensor([feat.size(0) for feat in feat_list], dtype=torch.int32).to(waves_16k.device) // 2
-        feat_list = [
-            torch.nn.functional.pad(feat, (0, 0, 0, max_feat_len - feat.size(0)), value=float(feat.min().item()))
-            for feat in feat_list
-        ]
-        feat = torch.stack(feat_list, dim=0)
-        style = self.style_encoder(feat, feat_lens)
+    def compute_style(self, waves_16k: torch.Tensor):
+        feat = torchaudio.compliance.kaldi.fbank(waves_16k,
+                                                 num_mel_bins=80,
+                                                 dither=0,
+                                                 sample_frequency=16000)
+        feat = feat - feat.mean(dim=0, keepdim=True)
+        style = self.style_encoder(feat.unsqueeze(0))
         return style
 
     @torch.no_grad()
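The rewritten compute_style reduces to a Kaldi-style filterbank followed by per-utterance mean normalization on a single waveform. A standalone sketch of that feature step (the style encoder itself is left out; the function name and shapes are illustrative, not part of the module):

    import torch
    import torchaudio

    def extract_style_features(wave_16k: torch.Tensor) -> torch.Tensor:
        # wave_16k: (1, num_samples) mono waveform already resampled to 16 kHz
        feat = torchaudio.compliance.kaldi.fbank(wave_16k,
                                                 num_mel_bins=80,
                                                 dither=0,
                                                 sample_frequency=16000)   # (num_frames, 80)
        feat = feat - feat.mean(dim=0, keepdim=True)   # per-utterance mean normalization
        return feat.unsqueeze(0)                       # (1, num_frames, 80), fed to a style encoder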
@@ -490,6 +424,7 @@ class VoiceConversionWrapper(torch.nn.Module):
 
         return content_indices
 
+    @spaces.GPU
     @torch.no_grad()
     @torch.inference_mode()
     def convert_voice_with_streaming(
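@spaces.GPU comes from Hugging Face's spaces package; on a ZeroGPU Space it allocates a GPU only while the decorated function (here the streaming conversion method) is running. A minimal sketch of the pattern, assuming a Spaces/ZeroGPU runtime:

    import spaces
    import torch

    @spaces.GPU  # request a ZeroGPU device for the duration of this call
    def denoise(x: torch.Tensor) -> torch.Tensor:
        # hypothetical GPU-bound step; any CUDA work inside the call is covered
        return x.to("cuda").clamp_(min=0).cpu()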
@@ -623,7 +558,10 @@ class VoiceConversionWrapper(torch.nn.Module):
 
                 if stream_output and mp3_bytes is not None:
                     yield mp3_bytes, full_audio
+
                 if should_break:
+                    if not stream_output:
+                        return full_audio
                     break
             else:
                 cond, _ = self.cfm_length_regulator(source_content_indices, ylens=torch.LongTensor([source_mel_len]).to(device))
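Because convert_voice_with_streaming is a generator (it yields MP3 chunks while streaming), the added return full_audio does not reach a caller as an ordinary return value; in Python 3 it becomes the value carried by StopIteration. A caller wanting the non-streaming result would have to drain the generator along these lines (a sketch against the signature shown in this diff):

    def collect_full_audio(vc_generator):
        # Drain a generator whose `return full_audio` rides on StopIteration.value.
        try:
            while True:
                next(vc_generator)
        except StopIteration as stop:
            return stop.value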
@@ -641,7 +579,7 @@ class VoiceConversionWrapper(torch.nn.Module):
             if self.dit_compiled:
                 cat_condition = torch.nn.functional.pad(cat_condition,
                                                         (0, 0, 0, self.compile_len - cat_condition.size(1),), value=0)
-            with torch.autocast(device_type=device.type, dtype=
+            with torch.autocast(device_type=device.type, dtype=dtype):
                 # Voice Conversion
                 vc_mel = self.cfm.inference(
                     cat_condition,
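The change replaces the previous autocast line (shown truncated in this view) with one that passes dtype=dtype explicitly, so the CFM inference call runs under mixed precision. A small self-contained sketch of the same pattern; the dtype choice here is an assumption, not the module's actual setting:

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16 if device.type == "cuda" else torch.bfloat16   # assumed precision choice
    with torch.autocast(device_type=device.type, dtype=dtype):
        a = torch.randn(8, 8, device=device)
        b = torch.randn(8, 8, device=device)
        y = a @ b   # matmul runs in the autocast dtype where supported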
@@ -660,5 +598,8 @@ class VoiceConversionWrapper(torch.nn.Module):
 
         if stream_output and mp3_bytes is not None:
             yield mp3_bytes, full_audio
+
         if should_break:
-
+            if not stream_output:
+                return full_audio
+            break