vocos-bark

Sleeping

App Files Files Community

ylacombe HF staff committed on Oct 13, 2023

Commit

2ba1b0d

•

1 Parent(s): 9f58137

Update vocos_bark.py

Browse files

Files changed (1) hide show

vocos_bark.py +14 -19

vocos_bark.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from vocos import Vocos
 from typing import Dict, Optional, Tuple, Union
 from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
@@ -7,11 +6,10 @@ from transformers.models.bark.generation_configuration_bark import (
     BarkFineGenerationConfig,
     BarkSemanticGenerationConfig,
 )
-from transformers import BarkConfig
 from transformers.modeling_utils import get_parameter_device
 from transformers.utils import (
     is_accelerate_available,
 )
 import torch
@@ -26,7 +24,8 @@ class BarkModel(BarkPreTrainedModel):
         self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
         self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
-        self.vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2")
         self.config = config
     @property
@@ -81,12 +80,20 @@ class BarkModel(BarkPreTrainedModel):
         self.fine_acoustics_hook = hook
-        _, hook = cpu_offload_with_hook(self.vocos, device, prev_module_hook=hook)
         # We'll offload the last model manually.
         self.codec_model_hook = hook
     @torch.no_grad()
     def generate(
@@ -197,18 +204,6 @@ class BarkModel(BarkPreTrainedModel):
             # and load codec_model to GPU
             # since bark doesn't use codec_model forward pass
             self.fine_acoustics_hook.offload()
-            self.vocos = self.vocos.to(self.device)
-        # 4. Decode the output and generate audio array
-        bandwidth_id = torch.tensor([2]).to(self.device)
-        # transpose
-        value = output.transpose(0,1)
-        value = self.vocos.codes_to_features(value)
-        value = self.vocos.decode(value, bandwidth_id=bandwidth_id)
-        if getattr(self, "codec_model_hook", None) is not None:
-            # Offload codec_model to CPU
-            self.vocos.offload()
-        return value

 from typing import Dict, Optional, Tuple, Union
 from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
     BarkFineGenerationConfig,
     BarkSemanticGenerationConfig,
 )
+from transformers import BarkConfig, AutoModel
 from transformers.modeling_utils import get_parameter_device
 from transformers.utils import (
     is_accelerate_available,
 )
 import torch
         self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
         self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
+        self.codec_model = AutoModel.from_config(config.codec_config)
         self.config = config
     @property
         self.fine_acoustics_hook = hook
+        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
         # We'll offload the last model manually.
         self.codec_model_hook = hook
+    def codec_decode(self, fine_output):
+        """Turn quantized audio codes into audio array using encodec."""
+        fine_output = fine_output.transpose(0, 1)
+        emb = self.codec_model.quantizer.decode(fine_output)
+        out = self.codec_model.decoder(emb)
+        audio_arr = out.squeeze(1)  # squeeze the codebook dimension
+        return audio_arr
     @torch.no_grad()
     def generate(
             # and load codec_model to GPU
             # since bark doesn't use codec_model forward pass
             self.fine_acoustics_hook.offload()
+            self.codec_model = self.codec_model.to(self.device)
+        return output