change to bigvan

- audioldm/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/__pycache__/ldm.cpython-310.pyc +0 -0
- audioldm/__pycache__/utils.cpython-310.pyc +0 -0
- audioldm/bigvgan/__init__.py +22 -0
- audioldm/bigvgan/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc +0 -0
- audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc +0 -0
- audioldm/ldm.py +6 -4
- audioldm/pipeline.py +7 -0
- audioldm/utils.py +2 -2
- audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc +0 -0
- audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc +0 -0
- audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc +0 -0
- audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc +0 -0
- audioldm/variational_autoencoder/autoencoder.py +9 -3
- requirements.txt +2 -1

audioldm/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/audioldm/__pycache__/__init__.cpython-310.pyc and b/audioldm/__pycache__/__init__.cpython-310.pyc differ

audioldm/__pycache__/ldm.cpython-310.pyc
CHANGED
Binary files a/audioldm/__pycache__/ldm.cpython-310.pyc and b/audioldm/__pycache__/ldm.cpython-310.pyc differ

audioldm/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/audioldm/__pycache__/utils.cpython-310.pyc and b/audioldm/__pycache__/utils.cpython-310.pyc differ

audioldm/bigvgan/__init__.py
ADDED
@@ -0,0 +1,22 @@
+import torch
+import bigvgan
+from huggingface_hub import hf_hub_download
+
+class BigVGANVocoder:
+    def __init__(self, device='cuda', use_cuda_kernel=False):
+        # Load the pretrained model
+        self.model = bigvgan.BigVGAN.from_pretrained(
+            'nvidia/bigvgan_v2_44khz_128band_512x',
+            use_cuda_kernel=use_cuda_kernel
+        )
+        self.model.remove_weight_norm()
+        self.model.eval().to(device)
+        self.device = device
+        self.h = self.model.h  # This holds config like sampling_rate, etc.
+
+    @torch.no_grad()
+    def infer_waveform(self, mel):
+        # mel shape: [B, n_mels, T], BigVGAN expects mel at model.h.n_mels, typically 128
+        mel = mel.to(self.device)
+        wav_gen = self.model(mel)
+        return wav_gen.squeeze(1)  # Returns [B, T]
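
For reference, a minimal usage sketch of the BigVGANVocoder class added above. The tensor shapes and the 128-band / 44.1 kHz figures come from the comments in the diff; the random mel input, the CUDA device, and the ability to download the bigvgan checkpoint are assumptions.

# Usage sketch only (assumes CUDA and that nvidia/bigvgan_v2_44khz_128band_512x can be fetched).
import torch
from audioldm.bigvgan import BigVGANVocoder

vocoder = BigVGANVocoder(device='cuda', use_cuda_kernel=False)
mel = torch.randn(1, 128, 250, device='cuda')   # illustrative mel: [B, n_mels=128, T]
wav = vocoder.infer_waveform(mel)               # [B, T_samples] waveform tensor
print(wav.shape, vocoder.h.sampling_rate)       # sampling rate is read from the model config
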
audioldm/bigvgan/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.03 kB).

audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc differ

audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc
CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc differ

audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc
CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc differ

audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc
CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc differ

audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc
CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc differ

audioldm/ldm.py
CHANGED
@@ -234,14 +234,16 @@ class LatentDiffusion(DDPM):
         return self.first_stage_model.decode(z)
 
     def mel_spectrogram_to_waveform(self, mel):
-        # Mel: [bs, 1, t-steps, fbins]
+        # Originally: self.first_stage_model.vocoder(mel)
+        # We'll call BigVGAN here instead.
         if len(mel.size()) == 4:
-            mel = mel.squeeze(1)
-        mel = mel.permute(0, 2, 1)
-        waveform = self.first_stage_model.vocoder(mel)
+            mel = mel.squeeze(1)  # shape: [B, time, n_mels]
+        mel = mel.permute(0, 2, 1)  # BigVGAN expects [B, n_mels, T]
+        waveform = self.vocoder.infer_waveform(mel)  # using BigVGAN vocoder now
         waveform = waveform.cpu().detach().numpy()
         return waveform
 
+
     @torch.no_grad()
     def encode_first_stage(self, x):
         return self.first_stage_model.encode(x)
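
In the new path the waveform comes from self.vocoder (attached to the LatentDiffusion instance in build_model, see pipeline.py below) rather than from self.first_stage_model.vocoder. A shape walk-through with throwaway tensors; the [B, 1, time, n_mels] input layout is taken from the in-line comments and is otherwise an assumption.

# Illustrative only: how the mel tensor is reshaped before the vocoder call.
import torch

mel = torch.randn(2, 1, 500, 128)    # assumed decoder output: [B, 1, time, n_mels]
if len(mel.size()) == 4:
    mel = mel.squeeze(1)             # -> [B, time, n_mels]
mel = mel.permute(0, 2, 1)           # -> [B, n_mels, time], the layout BigVGAN expects
print(mel.shape)                     # torch.Size([2, 128, 500])
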
audioldm/pipeline.py
CHANGED
@@ -10,7 +10,11 @@ from audioldm import LatentDiffusion, seed_everything
 from audioldm.utils import default_audioldm_config, get_duration, get_bit_depth, get_metadata, download_checkpoint
 from audioldm.audio import wav_to_fbank, TacotronSTFT, read_wav_file
 from audioldm.latent_diffusion.ddim import DDIMSampler
+from audioldm.bigvgan import BigVGANVocoder
 from einops import repeat
+from scipy.signal import convolve
+import numpy as np
+
 import os
 
 def make_batch_for_text_to_audio(text, waveform=None, fbank=None, batchsize=1):
@@ -89,6 +93,9 @@ def build_model(
     latent_diffusion = latent_diffusion.to(device)
 
     latent_diffusion.cond_stage_model.embed_mode = "text"
+    # Here is where you add the BigVGAN vocoder initialization
+
+    latent_diffusion.vocoder = BigVGANVocoder(device='cuda', use_cuda_kernel=False)
     return latent_diffusion
 
 def duration_to_latent_t_size(duration):
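
The committed build_model tail pins the vocoder to 'cuda'. A hedged variant (an assumption, not part of the commit) would reuse whatever device the diffusion model was already moved to, so the pipeline still builds on CPU-only hosts:

# Sketch only: attach the vocoder on the same device as the diffusion model.
from audioldm.bigvgan import BigVGANVocoder

def attach_vocoder(latent_diffusion, use_cuda_kernel=False):
    device = next(latent_diffusion.parameters()).device
    latent_diffusion.vocoder = BigVGANVocoder(device=str(device), use_cuda_kernel=use_cuda_kernel)
    return latent_diffusion
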
audioldm/utils.py
CHANGED
@@ -106,10 +106,10 @@ def default_audioldm_config(model_name="audioldm-s-full"):
             "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
         },
         "preprocessing": {
-            "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
+            "audio": {"sampling_rate": 44100, "max_wav_value": 32768},
             "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
             "mel": {
-                "n_mel_channels": 64,
+                "n_mel_channels": 128,
                 "mel_fmin": 0,
                 "mel_fmax": 8000,
                 "freqm": 0,
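
The sampling rate and mel band count change here while the STFT hop length stays at 160, which changes the mel frame rate the rest of the pipeline sees. Quick arithmetic with the values from this diff (the 16 kHz figure is the previous default):

# Mel frames per second implied by the config values above (illustration only).
new_rate = 44100 / 160    # ~275.6 frames per second with the new sampling rate
old_rate = 16000 / 160    # 100.0 frames per second with the previous default
print(new_rate, old_rate)
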
audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc differ

audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc
CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc differ

audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc
CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc differ

audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc
CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc differ

audioldm/variational_autoencoder/autoencoder.py
CHANGED
@@ -3,7 +3,11 @@ from audioldm.latent_diffusion.ema import *
 from audioldm.variational_autoencoder.modules import Encoder, Decoder
 from audioldm.variational_autoencoder.distributions import DiagonalGaussianDistribution
 
-from audioldm.hifigan.utilities import get_vocoder, vocoder_infer
+#from audioldm.hifigan.utilities import get_vocoder, vocoder_infer
+from audioldm.bigvgan import BigVGANVocoder
+
+# After you create your latent_diffusion instance:
+latent_diffusion.vocoder = BigVGANVocoder(device='cuda', use_cuda_kernel=False)
 
 
 class AutoencoderKL(nn.Module):
@@ -36,7 +40,8 @@ class AutoencoderKL(nn.Module):
         self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
 
-        self.vocoder = get_vocoder(None, "cpu")
+        self.vocoder = BigVGANVocoder(device=device)
+        #self.vocoder = get_vocoder(None, "cpu")
         self.embed_dim = embed_dim
 
         if monitor is not None:
@@ -65,7 +70,8 @@ class AutoencoderKL(nn.Module):
 
     def decode_to_waveform(self, dec):
         dec = dec.squeeze(1).permute(0, 2, 1)
-        wav_reconstruction = vocoder_infer(dec, self.vocoder)
+        wav_reconstruction = vocoder.infer_waveform(mels)
+        #wav_reconstruction = vocoder_infer(dec, self.vocoder)
        return wav_reconstruction
 
     def forward(self, input, sample_posterior=True):
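
Two of the lines added in this file would not run as committed: the module-level `latent_diffusion.vocoder = ...` assignment references a name that does not exist at import time, and the new `decode_to_waveform` line calls `vocoder.infer_waveform(mels)` with names that are not defined in that scope. A sketch of the presumably intended method body (an assumption, shown as a standalone illustration using the instance attribute and the tensor already reshaped above):

# Sketch only: likely-intended decode_to_waveform.
class _WaveformDecodeSketch:
    def __init__(self, vocoder):
        self.vocoder = vocoder                            # e.g. a BigVGANVocoder instance

    def decode_to_waveform(self, dec):
        dec = dec.squeeze(1).permute(0, 2, 1)             # [B, 1, T, n_mels] -> [B, n_mels, T]
        wav_reconstruction = self.vocoder.infer_waveform(dec)
        return wav_reconstruction
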
requirements.txt
CHANGED
@@ -29,4 +29,5 @@ tqdm==4.63.1
 wandb==0.12.14
 ipython==8.12.0
 gradio==4.3.0
-wavio==0.0.7
+wavio==0.0.7
+bigvgan==2.4.1