jacob-c committed on
Commit
c1f2d61
1 Parent(s): 48225e6

change to bigvgan

audioldm/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/audioldm/__pycache__/__init__.cpython-310.pyc and b/audioldm/__pycache__/__init__.cpython-310.pyc differ
 
audioldm/__pycache__/ldm.cpython-310.pyc CHANGED
Binary files a/audioldm/__pycache__/ldm.cpython-310.pyc and b/audioldm/__pycache__/ldm.cpython-310.pyc differ
 
audioldm/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/audioldm/__pycache__/utils.cpython-310.pyc and b/audioldm/__pycache__/utils.cpython-310.pyc differ
 
audioldm/bigvgan/__init__.py ADDED
@@ -0,0 +1,22 @@
+import torch
+import bigvgan
+from huggingface_hub import hf_hub_download
+
+class BigVGANVocoder:
+    def __init__(self, device='cuda', use_cuda_kernel=False):
+        # Load the pretrained model
+        self.model = bigvgan.BigVGAN.from_pretrained(
+            'nvidia/bigvgan_v2_44khz_128band_512x',
+            use_cuda_kernel=use_cuda_kernel
+        )
+        self.model.remove_weight_norm()
+        self.model.eval().to(device)
+        self.device = device
+        self.h = self.model.h  # This holds config like sampling_rate, etc.
+
+    @torch.no_grad()
+    def infer_waveform(self, mel):
+        # mel shape: [B, n_mels, T], BigVGAN expects mel at model.h.n_mels, typically 128
+        mel = mel.to(self.device)
+        wav_gen = self.model(mel)
+        return wav_gen.squeeze(1)  # Returns [B, T]
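A quick way to smoke-test the new wrapper (not part of the commit; a minimal sketch assuming the bigvgan package from requirements.txt and a CUDA device are available — the random mel only exercises tensor shapes, not audio quality):

import torch
from audioldm.bigvgan import BigVGANVocoder

vocoder = BigVGANVocoder(device='cuda', use_cuda_kernel=False)
mel = torch.randn(1, 128, 512)             # [B, n_mels, T]; this checkpoint uses 128 mel bands
wav = vocoder.infer_waveform(mel)          # [B, samples]; roughly 512 samples per mel frame for the 512x model
print(wav.shape, vocoder.h.sampling_rate)  # sampling_rate should read 44100 for this checkpoint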
audioldm/bigvgan/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.03 kB).
 
audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc differ
 
audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc differ
 
audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc differ
 
audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc differ
 
audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc CHANGED
Binary files a/audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc and b/audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc differ
 
audioldm/ldm.py CHANGED
@@ -234,14 +234,16 @@ class LatentDiffusion(DDPM):
         return self.first_stage_model.decode(z)
 
     def mel_spectrogram_to_waveform(self, mel):
-        # Mel: [bs, 1, t-steps, fbins]
+        # Originally: self.first_stage_model.vocoder(mel)
+        # We'll call BigVGAN here instead.
         if len(mel.size()) == 4:
-            mel = mel.squeeze(1)
-        mel = mel.permute(0, 2, 1)
-        waveform = self.first_stage_model.vocoder(mel)
+            mel = mel.squeeze(1)  # shape: [B, time, n_mels]
+        mel = mel.permute(0, 2, 1)  # BigVGAN expects [B, n_mels, T]
+        waveform = self.vocoder.infer_waveform(mel)  # using BigVGAN vocoder now
         waveform = waveform.cpu().detach().numpy()
         return waveform
 
+
     @torch.no_grad()
     def encode_first_stage(self, x):
         return self.first_stage_model.encode(x)
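The reshaping in the new mel_spectrogram_to_waveform can be checked in isolation; the [B, 1, T, n_mels] input layout comes from the "# Mel: [bs, 1, t-steps, fbins]" comment the commit removes (a sketch, independent of any checkpoint):

import torch

mel = torch.randn(2, 1, 1024, 128)  # [B, 1, T, n_mels], as produced by the latent decoder
if len(mel.size()) == 4:
    mel = mel.squeeze(1)            # [B, T, n_mels]
mel = mel.permute(0, 2, 1)          # [B, n_mels, T], the layout BigVGAN expects
assert mel.shape == (2, 128, 1024)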
audioldm/pipeline.py CHANGED
@@ -10,7 +10,11 @@ from audioldm import LatentDiffusion, seed_everything
 from audioldm.utils import default_audioldm_config, get_duration, get_bit_depth, get_metadata, download_checkpoint
 from audioldm.audio import wav_to_fbank, TacotronSTFT, read_wav_file
 from audioldm.latent_diffusion.ddim import DDIMSampler
+from audioldm.bigvgan import BigVGANVocoder
 from einops import repeat
+from scipy.signal import convolve
+import numpy as np
+
 import os
 
 def make_batch_for_text_to_audio(text, waveform=None, fbank=None, batchsize=1):
@@ -89,6 +93,9 @@ def build_model(
     latent_diffusion = latent_diffusion.to(device)
 
     latent_diffusion.cond_stage_model.embed_mode = "text"
+    # Initialize the BigVGAN vocoder here
+
+    latent_diffusion.vocoder = BigVGANVocoder(device='cuda', use_cuda_kernel=False)
     return latent_diffusion
 
 def duration_to_latent_t_size(duration):
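End to end, the patched build_model now returns a pipeline with the BigVGAN vocoder attached. A usage sketch (not part of the commit; the text_to_audio argument names are taken from the existing pipeline and are unchanged here, and a checkpoint download is required on first run):

from audioldm import build_model, text_to_audio

latent_diffusion = build_model()                 # BigVGANVocoder is attached inside build_model
print(type(latent_diffusion.vocoder).__name__)   # BigVGANVocoder

waveform = text_to_audio(latent_diffusion, "a dog barking", duration=5, ddim_steps=100)
print(waveform.shape)                            # numpy waveforms, now at 44.1 kHz under the updated config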
audioldm/utils.py CHANGED
@@ -106,10 +106,10 @@ def default_audioldm_config(model_name="audioldm-s-full"):
106
  "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
107
  },
108
  "preprocessing": {
109
- "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
110
  "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
111
  "mel": {
112
- "n_mel_channels": 64,
113
  "mel_fmin": 0,
114
  "mel_fmax": 8000,
115
  "freqm": 0,
 
106
  "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
107
  },
108
  "preprocessing": {
109
+ "audio": {"sampling_rate": 44100, "max_wav_value": 32768},
110
  "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
111
  "mel": {
112
+ "n_mel_channels": 128,
113
  "mel_fmin": 0,
114
  "mel_fmax": 8000,
115
  "freqm": 0,
audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc differ
 
audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc differ
 
audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc differ
 
audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc CHANGED
Binary files a/audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc and b/audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc differ
 
audioldm/variational_autoencoder/autoencoder.py CHANGED
@@ -3,7 +3,11 @@ from audioldm.latent_diffusion.ema import *
 from audioldm.variational_autoencoder.modules import Encoder, Decoder
 from audioldm.variational_autoencoder.distributions import DiagonalGaussianDistribution
 
-from audioldm.hifigan.utilities import get_vocoder, vocoder_infer
+# from audioldm.hifigan.utilities import get_vocoder, vocoder_infer
+from audioldm.bigvgan import BigVGANVocoder
+
+# After you create your latent_diffusion instance:
+# latent_diffusion.vocoder = BigVGANVocoder(device='cuda', use_cuda_kernel=False)
 
 
 class AutoencoderKL(nn.Module):
@@ -36,7 +40,8 @@ class AutoencoderKL(nn.Module):
         self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
 
-        self.vocoder = get_vocoder(None, "cpu")
+        self.vocoder = BigVGANVocoder(device="cpu", use_cuda_kernel=False)
+        # self.vocoder = get_vocoder(None, "cpu")
         self.embed_dim = embed_dim
 
         if monitor is not None:
@@ -65,7 +70,8 @@ class AutoencoderKL(nn.Module):
 
     def decode_to_waveform(self, dec):
         dec = dec.squeeze(1).permute(0, 2, 1)
-        wav_reconstruction = vocoder_infer(dec, self.vocoder)
+        wav_reconstruction = self.vocoder.infer_waveform(dec)
+        # wav_reconstruction = vocoder_infer(dec, self.vocoder)
         return wav_reconstruction
 
     def forward(self, input, sample_posterior=True):
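One caveat with attaching the wrapper in __init__: BigVGANVocoder is a plain Python object rather than an nn.Module, so AutoencoderKL.to(device) will not move it — it stays on whatever device it was constructed with. A sketch (not part of the commit) that makes the placement visible:

from audioldm.bigvgan import BigVGANVocoder

vocoder = BigVGANVocoder(device='cpu')
print(next(vocoder.model.parameters()).device)   # stays on cpu no matter where the autoencoder is later moved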
requirements.txt CHANGED
@@ -29,4 +29,5 @@ tqdm==4.63.1
 wandb==0.12.14
 ipython==8.12.0
 gradio==4.3.0
-wavio==0.0.7
+wavio==0.0.7
+bigvgan==2.4.1