Spaces:
Running on Zero
Running on Zero
Commit ·
04fdc6c
1
Parent(s): aa53ba5
Fix MMAudio: load BigVGAN from local snapshot dir, not HF network
Browse files
BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x') was hitting
the HF network inside the ZeroGPU worker because AutoEncoderModule hardcoded
the repo ID string for 44k mode, ignoring the vocoder_ckpt_path argument.
Fixes:
- _dl_bigvgan() now returns the local snapshot dir from snapshot_download()
- bigvgan_local_dir captured at startup alongside other checkpoint paths
- _load_mmaudio_models passes bigvgan_local_dir as bigvgan_vocoder_ckpt
- AutoEncoderModule (44k path) uses vocoder_ckpt_path as the from_pretrained
source when it points to a local directory, falling back to the repo ID
string only when no local path is provided
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- MMAudio/mmaudio/ext/autoencoder/autoencoder.py +7 -2
- app.py +9 -4
MMAudio/mmaudio/ext/autoencoder/autoencoder.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from typing import Literal, Optional
|
| 2 |
|
| 3 |
import torch
|
|
@@ -27,8 +28,12 @@ class AutoEncoderModule(nn.Module):
|
|
| 27 |
assert vocoder_ckpt_path is not None
|
| 28 |
self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
|
| 29 |
elif mode == '44k':
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
self.vocoder.remove_weight_norm()
|
| 33 |
else:
|
| 34 |
raise ValueError(f'Unknown mode: {mode}')
|
|
|
|
| 1 |
+
import os
|
| 2 |
from typing import Literal, Optional
|
| 3 |
|
| 4 |
import torch
|
|
|
|
| 28 |
assert vocoder_ckpt_path is not None
|
| 29 |
self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
|
| 30 |
elif mode == '44k':
|
| 31 |
+
# If vocoder_ckpt_path points to a local snapshot directory, use it
|
| 32 |
+
# directly to avoid a network fetch inside ZeroGPU workers.
|
| 33 |
+
bigvgan_src = vocoder_ckpt_path if (
|
| 34 |
+
vocoder_ckpt_path is not None and os.path.isdir(vocoder_ckpt_path)
|
| 35 |
+
) else 'nvidia/bigvgan_v2_44khz_128band_512x'
|
| 36 |
+
self.vocoder = BigVGANv2.from_pretrained(bigvgan_src, use_cuda_kernel=False)
|
| 37 |
self.vocoder.remove_weight_norm()
|
| 38 |
else:
|
| 39 |
raise ValueError(f'Unknown mode: {mode}')
|
app.py
CHANGED
|
@@ -98,9 +98,13 @@ def _dl_audioldm2():
|
|
| 98 |
print("AudioLDM2 pre-downloaded.")
|
| 99 |
|
| 100 |
def _dl_bigvgan():
|
| 101 |
-
"""Pre-download BigVGAN vocoder (~489 MB) used by MMAudio.
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
print("[startup] Starting parallel checkpoint + model downloads…")
|
| 106 |
_t_dl_start = time.perf_counter()
|
|
@@ -119,6 +123,7 @@ with ThreadPoolExecutor(max_workers=7) as _pool:
|
|
| 119 |
|
| 120 |
cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _fut_taro.result()
|
| 121 |
mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.result()
|
|
|
|
| 122 |
print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
|
| 123 |
|
| 124 |
# ================================================================== #
|
|
@@ -380,7 +385,7 @@ def _load_mmaudio_models(device, dtype):
|
|
| 380 |
tod_vae_ckpt=str(model_cfg.vae_path),
|
| 381 |
synchformer_ckpt=str(model_cfg.synchformer_ckpt),
|
| 382 |
enable_conditions=True, mode=model_cfg.mode,
|
| 383 |
-
bigvgan_vocoder_ckpt=
|
| 384 |
).to(device, dtype).eval()
|
| 385 |
return net, feature_utils, model_cfg, seq_cfg
|
| 386 |
|
|
|
|
| 98 |
print("AudioLDM2 pre-downloaded.")
|
| 99 |
|
| 100 |
def _dl_bigvgan():
|
| 101 |
+
"""Pre-download BigVGAN vocoder (~489 MB) used by MMAudio.
|
| 102 |
+
Returns the local snapshot directory so _load_mmaudio_models can pass it
|
| 103 |
+
to BigVGANv2.from_pretrained() as a local path, avoiding a network hit
|
| 104 |
+
inside the ZeroGPU worker."""
|
| 105 |
+
local_dir = snapshot_download(repo_id="nvidia/bigvgan_v2_44khz_128band_512x")
|
| 106 |
+
print(f"BigVGAN vocoder pre-downloaded to {local_dir}.")
|
| 107 |
+
return local_dir
|
| 108 |
|
| 109 |
print("[startup] Starting parallel checkpoint + model downloads…")
|
| 110 |
_t_dl_start = time.perf_counter()
|
|
|
|
| 123 |
|
| 124 |
cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _fut_taro.result()
|
| 125 |
mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _fut_mmaudio.result()
|
| 126 |
+
bigvgan_local_dir = _fut_bigvgan.result()
|
| 127 |
print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
|
| 128 |
|
| 129 |
# ================================================================== #
|
|
|
|
| 385 |
tod_vae_ckpt=str(model_cfg.vae_path),
|
| 386 |
synchformer_ckpt=str(model_cfg.synchformer_ckpt),
|
| 387 |
enable_conditions=True, mode=model_cfg.mode,
|
| 388 |
+
bigvgan_vocoder_ckpt=bigvgan_local_dir, need_vae_encoder=False,
|
| 389 |
).to(device, dtype).eval()
|
| 390 |
return net, feature_utils, model_cfg, seq_cfg
|
| 391 |
|