Spaces: Running on Zero
Rex Cheng committed
Commit 164c335 • Parent(s): 627e0b8

speed up inference
Changed files:
- app.py +2 -1
- mmaudio/eval_utils.py +20 -17
- mmaudio/ext/autoencoder/autoencoder.py +5 -1
- mmaudio/model/utils/features_utils.py +7 -5
app.py
CHANGED
@@ -48,7 +48,8 @@ def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
                                   synchformer_ckpt=model.synchformer_ckpt,
                                   enable_conditions=True,
                                   mode=model.mode,
-                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path)
+                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
+                                  need_vae_encoder=False)
     feature_utils = feature_utils.to(device, dtype).eval()
 
     return net, feature_utils, seq_cfg
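The only app-side change is the new need_vae_encoder=False argument: the demo only decodes audio latents, so the VAE's audio encoder is never called and its weights no longer need to be kept around. A minimal usage sketch, assuming hypothetical checkpoint paths in place of the ones the app resolves from its ModelConfig entries, and an example device/dtype choice:

# Sketch only: paths, mode, device, and dtype below are illustrative assumptions,
# not the values app.py actually resolves from all_model_cfg.
import torch
from mmaudio.model.utils.features_utils import FeaturesUtils

feature_utils = FeaturesUtils(tod_vae_ckpt='ext_weights/v1-44.pth',                    # hypothetical path
                              synchformer_ckpt='ext_weights/synchformer_state_dict.pth',  # hypothetical path
                              enable_conditions=True,
                              mode='44k',
                              bigvgan_vocoder_ckpt=None,   # assumption: vocoder ckpt only needed for 16k mode
                              need_vae_encoder=False)      # skip an encoder the demo never calls
feature_utils = feature_utils.to('cuda', torch.bfloat16).eval()

# AutoEncoderModule deleted the VAE encoder submodule, so it holds no parameters anymore:
assert not hasattr(feature_utils.tod.vae, 'encoder')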
mmaudio/eval_utils.py
CHANGED
@@ -76,29 +76,37 @@ all_model_cfg: dict[str, ModelConfig] = {
 }
 
 
-def generate(clip_video: Optional[torch.Tensor],
-             sync_video: Optional[torch.Tensor],
-             text: Optional[list[str]],
-             *,
-             negative_text: Optional[list[str]] = None,
-             feature_utils: FeaturesUtils,
-             net: MMAudio,
-             fm: FlowMatching,
-             rng: torch.Generator,
-             cfg_strength: float) -> torch.Tensor:
+def generate(
+    clip_video: Optional[torch.Tensor],
+    sync_video: Optional[torch.Tensor],
+    text: Optional[list[str]],
+    *,
+    negative_text: Optional[list[str]] = None,
+    feature_utils: FeaturesUtils,
+    net: MMAudio,
+    fm: FlowMatching,
+    rng: torch.Generator,
+    cfg_strength: float,
+    clip_batch_size_multiplier: int = 40,
+    sync_batch_size_multiplier: int = 40,
+) -> torch.Tensor:
     device = feature_utils.device
     dtype = feature_utils.dtype
 
     bs = len(text)
     if clip_video is not None:
         clip_video = clip_video.to(device, dtype, non_blocking=True)
-        clip_features = feature_utils.encode_video_with_clip(clip_video, batch_size=bs)
+        clip_features = feature_utils.encode_video_with_clip(clip_video,
+                                                             batch_size=bs *
+                                                             clip_batch_size_multiplier)
     else:
         clip_features = net.get_empty_clip_sequence(bs)
 
     if sync_video is not None:
         sync_video = sync_video.to(device, dtype, non_blocking=True)
-        sync_features = feature_utils.encode_video_with_sync(sync_video, batch_size=bs)
+        sync_features = feature_utils.encode_video_with_sync(sync_video,
+                                                             batch_size=bs *
+                                                             sync_batch_size_multiplier)
     else:
         sync_features = net.get_empty_sync_sequence(bs)
 
@@ -185,14 +193,9 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
     data_chunk = reader.pop_chunks()
     clip_chunk = data_chunk[0]
     sync_chunk = data_chunk[1]
-    print('clip', clip_chunk.shape, clip_chunk.dtype, clip_chunk.max())
-    print('sync', sync_chunk.shape, sync_chunk.dtype, sync_chunk.max())
     assert clip_chunk is not None
     assert sync_chunk is not None
 
-    for i in range(reader.num_out_streams):
-        print(reader.get_out_stream_info(i))
-
     clip_frames = clip_transform(clip_chunk)
     sync_frames = sync_transform(sync_chunk)
 
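generate() now forwards much larger batch sizes to the CLIP and Synchformer feature extractors (bs * multiplier instead of bs), which is where most of the speed-up comes from; the two keyword-only multipliers let callers dial memory use back down, and the load_video debug prints are gone. A hedged usage sketch of the new signature, assuming net, fm, feature_utils, clip_frames, and sync_frames have been prepared as in the demo, and with an illustrative prompt and cfg_strength:

# Illustrative only: argument values are examples, not the demo's exact settings.
import torch
from mmaudio.eval_utils import generate

rng = torch.Generator(device='cuda').manual_seed(42)
audio = generate(clip_frames.unsqueeze(0),        # (1, T_clip, 3, H, W) CLIP frames
                 sync_frames.unsqueeze(0),        # (1, T_sync, 3, H, W) Synchformer frames
                 ['metallic clanging'],           # bs = len(text) = 1
                 negative_text=[''],
                 feature_utils=feature_utils,
                 net=net,
                 fm=fm,
                 rng=rng,
                 cfg_strength=4.5,
                 clip_batch_size_multiplier=40,   # encoder batch = bs * 40;
                 sync_batch_size_multiplier=40)   # lower these if feature extraction runs out of memory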
mmaudio/ext/autoencoder/autoencoder.py
CHANGED
@@ -15,7 +15,8 @@ class AutoEncoderModule(nn.Module):
                  *,
                  vae_ckpt_path,
                  vocoder_ckpt_path: Optional[str] = None,
-                 mode: Literal['16k', '44k']):
+                 mode: Literal['16k', '44k'],
+                 need_vae_encoder: bool = True):
         super().__init__()
         self.vae: VAE = get_my_vae(mode).eval()
         vae_state_dict = torch.load(vae_ckpt_path, weights_only=True, map_location='cpu')
@@ -35,6 +36,9 @@ class AutoEncoderModule(nn.Module):
         for param in self.parameters():
             param.requires_grad = False
 
+        if not need_vae_encoder:
+            del self.vae.encoder
+
     @torch.inference_mode()
     def encode(self, x: torch.Tensor) -> DiagonalGaussianDistribution:
         return self.vae.encode(x)
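Deleting self.vae.encoder after the checkpoint has been loaded is a standard way to drop an unused submodule: its parameters leave the module tree, so they are never moved to the GPU, at the cost of encode() no longer being usable. A self-contained toy illustration of the pattern (not MMAudio code):

import torch
from torch import nn

class ToyAutoEncoder(nn.Module):
    def __init__(self, need_encoder: bool = True):
        super().__init__()
        self.encoder = nn.Linear(128, 16)
        self.decoder = nn.Linear(16, 128)
        if not need_encoder:
            del self.encoder  # same trick as AutoEncoderModule(need_vae_encoder=False)

    def decode(self, z: torch.Tensor) -> torch.Tensor:
        return self.decoder(z)

full = ToyAutoEncoder(need_encoder=True)
slim = ToyAutoEncoder(need_encoder=False)
print(sum(p.numel() for p in full.parameters()))   # encoder + decoder parameters
print(sum(p.numel() for p in slim.parameters()))   # decoder parameters only

In AutoEncoderModule the deletion happens after the VAE state dict has been loaded and the parameters frozen, so weight loading is unaffected and only the retained submodules consume memory at inference time.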
mmaudio/model/utils/features_utils.py
CHANGED
@@ -41,6 +41,7 @@ class FeaturesUtils(nn.Module):
         synchformer_ckpt: Optional[str] = None,
         enable_conditions: bool = True,
         mode=Literal['16k', '44k'],
+        need_vae_encoder: bool = True,
     ):
         super().__init__()
 
@@ -64,19 +65,18 @@ class FeaturesUtils(nn.Module):
         if tod_vae_ckpt is not None:
             self.tod = AutoEncoderModule(vae_ckpt_path=tod_vae_ckpt,
                                          vocoder_ckpt_path=bigvgan_vocoder_ckpt,
-                                         mode=mode)
+                                         mode=mode,
+                                         need_vae_encoder=need_vae_encoder)
         else:
             self.tod = None
         self.mel_converter = MelConverter()
 
     def compile(self):
         if self.clip_model is not None:
-            self.encode_video_with_clip = torch.compile(self.encode_video_with_clip)
             self.clip_model.encode_image = torch.compile(self.clip_model.encode_image)
             self.clip_model.encode_text = torch.compile(self.clip_model.encode_text)
         if self.synchformer is not None:
             self.synchformer = torch.compile(self.synchformer)
-        self.tod.encode = torch.compile(self.tod.encode)
         self.decode = torch.compile(self.decode)
         self.vocode = torch.compile(self.vocode)
 
@@ -121,9 +121,11 @@ class FeaturesUtils(nn.Module):
         outputs = []
         if batch_size < 0:
             batch_size = b
-        for i in range(0, b, batch_size):
+        x = rearrange(x, 'b s t c h w -> (b s) 1 t c h w')
+        for i in range(0, b * num_segments, batch_size):
             outputs.append(self.synchformer(x[i:i + batch_size]))
-        x = torch.cat(outputs, dim=0)
+        x = torch.cat(outputs, dim=0)
+        x = rearrange(x, '(b s) 1 t d -> b (s t) d', b=b)
         return x
 
     @torch.inference_mode()
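The reshaping around the Synchformer loop is what lets the larger batch_size from generate() actually help: instead of batching over videos only, the (video, segment) pairs are flattened into one axis and chunked together. A self-contained shape-level sketch of the pattern, with a dummy encoder standing in for the real Synchformer and an assumed feature dimension of 768:

import torch
from einops import rearrange

def dummy_synchformer(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for the real model: (n, 1, t, c, h, w) -> (n, 1, t, d)
    n, _, t, _, _, _ = x.shape
    return torch.zeros(n, 1, t, 768)

b, s, t, c, h, w = 2, 8, 16, 3, 16, 16   # toy sizes; real Synchformer frames are larger
num_segments = s
batch_size = b * 40                      # bs * sync_batch_size_multiplier

x = torch.randn(b, s, t, c, h, w)
x = rearrange(x, 'b s t c h w -> (b s) 1 t c h w')      # flatten (video, segment) pairs
outputs = []
for i in range(0, b * num_segments, batch_size):        # chunk over the flattened axis
    outputs.append(dummy_synchformer(x[i:i + batch_size]))
x = torch.cat(outputs, dim=0)
x = rearrange(x, '(b s) 1 t d -> b (s t) d', b=b)       # back to one feature sequence per video
print(x.shape)                                          # torch.Size([2, 128, 768])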