Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors Claude Sonnet 4.6 committed about 10 hours ago

Commit

04fdc6c

1 Parent(s): aa53ba5

Fix MMAudio: load BigVGAN from local snapshot dir, not HF network

BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x') was hitting
the HF network inside the ZeroGPU worker because AutoEncoderModule hardcoded
the repo ID string for 44k mode, ignoring the vocoder_ckpt_path argument.

Fixes:
- _dl_bigvgan() now returns the local snapshot dir from snapshot_download()
- bigvgan_local_dir captured at startup alongside other checkpoint paths
- _load_mmaudio_models passes bigvgan_local_dir as bigvgan_vocoder_ckpt
- AutoEncoderModule (44k path) passes vocoder_ckpt_path to from_pretrained
  when it is an existing local directory, and falls back to the repo ID
  string otherwise (i.e. when the path is None or is not a directory)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

MMAudio/mmaudio/ext/autoencoder/autoencoder.py +7 -2
app.py +9 -4

MMAudio/mmaudio/ext/autoencoder/autoencoder.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from typing import Literal, Optional
 import torch
@@ -27,8 +28,12 @@ class AutoEncoderModule(nn.Module):
             assert vocoder_ckpt_path is not None
             self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
         elif mode == '44k':
-            self.vocoder = BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x',
-                                                     use_cuda_kernel=False)
             self.vocoder.remove_weight_norm()
         else:
             raise ValueError(f'Unknown mode: {mode}')

+import os
 from typing import Literal, Optional
 import torch
             assert vocoder_ckpt_path is not None
             self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
         elif mode == '44k':
+            # If vocoder_ckpt_path points to a local snapshot directory, use it
+            # directly to avoid a network fetch inside ZeroGPU workers.
+            bigvgan_src = vocoder_ckpt_path if (
+                vocoder_ckpt_path is not None and os.path.isdir(vocoder_ckpt_path)
+            ) else 'nvidia/bigvgan_v2_44khz_128band_512x'
+            self.vocoder = BigVGANv2.from_pretrained(bigvgan_src, use_cuda_kernel=False)
             self.vocoder.remove_weight_norm()
         else:
             raise ValueError(f'Unknown mode: {mode}')

app.py CHANGED Viewed

@@ -98,9 +98,13 @@ def _dl_audioldm2():
     print("AudioLDM2 pre-downloaded.")
 def _dl_bigvgan():
-    """Pre-download BigVGAN vocoder (~489 MB) used by MMAudio."""
-    snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
-    print("BigVGAN vocoder pre-downloaded.")
 print("[startup] Starting parallel checkpoint + model downloads…")
 _t_dl_start = time.perf_counter()
@@ -119,6 +123,7 @@ with ThreadPoolExecutor(max_workers=7) as _pool:
 cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _fut_taro.result()
 mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.result()
 print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
 # ================================================================== #
@@ -380,7 +385,7 @@ def _load_mmaudio_models(device, dtype):
         tod_vae_ckpt=str(model_cfg.vae_path),
         synchformer_ckpt=str(model_cfg.synchformer_ckpt),
         enable_conditions=True, mode=model_cfg.mode,
-        bigvgan_vocoder_ckpt=None, need_vae_encoder=False,
     ).to(device, dtype).eval()
     return net, feature_utils, model_cfg, seq_cfg

     print("AudioLDM2 pre-downloaded.")
 def _dl_bigvgan():
+    """Pre-download BigVGAN vocoder (~489 MB) used by MMAudio.
+    Returns the local snapshot directory so _load_mmaudio_models can pass it
+    to BigVGANv2.from_pretrained() as a local path, avoiding a network hit
+    inside the ZeroGPU worker."""
+    local_dir = snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
+    print(f"BigVGAN vocoder pre-downloaded to {local_dir}.")
+    return local_dir
 print("[startup] Starting parallel checkpoint + model downloads…")
 _t_dl_start = time.perf_counter()
 cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _fut_taro.result()
 mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.result()
+bigvgan_local_dir = _fut_bigvgan.result()
 print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
 # ================================================================== #
         tod_vae_ckpt=str(model_cfg.vae_path),
         synchformer_ckpt=str(model_cfg.synchformer_ckpt),
         enable_conditions=True, mode=model_cfg.mode,
+        bigvgan_vocoder_ckpt=bigvgan_local_dir, need_vae_encoder=False,
     ).to(device, dtype).eval()
     return net, feature_utils, model_cfg, seq_cfg