teticio committed
Commit 4033ea7
1 Parent(s): 327bccf

move sample_size out of pipeline

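In words: the spectrogram resolution is no longer a constructor argument; it is read from the loaded checkpoint's `unet.sample_size`, so the `Mel` object and the model can no longer disagree. A minimal sketch of the normalization the diff introduces (the helper name is ours; the diff inlines this logic):

# Sketch only: normalize unet.sample_size to (height, width).
# Older diffusers UNets store an int (square images); newer ones
# may store a (height, width) pair -- hence the "backwards
# compatibility" comment in the diff below.
def normalize_sample_size(sample_size):
    if isinstance(sample_size, int):
        return (sample_size, sample_size)
    return tuple(sample_size)

assert normalize_sample_size(256) == (256, 256)
assert normalize_sample_size((64, 512)) == (64, 512)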
audiodiffusion/__init__.py CHANGED
@@ -10,14 +10,13 @@ from diffusers import (DiffusionPipeline, DDPMPipeline, UNet2DConditionModel,
 
 from .mel import Mel
 
-VERSION = "1.2.0"
+VERSION = "1.2.1"
 
 
 class AudioDiffusion:
 
     def __init__(self,
                  model_id: str = "teticio/audio-diffusion-256",
-                 resolution: int = 256,
                  sample_rate: int = 22050,
                  n_fft: int = 2048,
                  hop_length: int = 512,
@@ -28,7 +27,6 @@ class AudioDiffusion:
 
         Args:
             model_id (String): name of model (local directory or Hugging Face Hub)
-            resolution (int): size of square mel spectrogram in pixels
             sample_rate (int): sample rate of audio
             n_fft (int): number of Fast Fourier Transforms
             hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
@@ -36,12 +34,6 @@ class AudioDiffusion:
             cuda (bool): use CUDA?
             progress_bar (iterable): iterable callback for progress updates or None
         """
-        self.mel = Mel(x_res=resolution,
-                       y_res=resolution,
-                       sample_rate=sample_rate,
-                       n_fft=n_fft,
-                       hop_length=hop_length,
-                       top_db=top_db)
         self.model_id = model_id
         pipeline = {
             'LatentAudioDiffusionPipeline': LatentAudioDiffusionPipeline,
@@ -54,6 +46,18 @@ class AudioDiffusion:
             self.pipe.to("cuda")
         self.progress_bar = progress_bar or (lambda _: _)
 
+        # For backwards compatibility
+        sample_size = (self.pipe.unet.sample_size,
+                       self.pipe.unet.sample_size) if type(
+                           self.pipe.unet.sample_size
+                       ) == int else self.pipe.unet.sample_size
+        self.mel = Mel(x_res=sample_size[1],
+                       y_res=sample_size[0],
+                       sample_rate=sample_rate,
+                       n_fft=n_fft,
+                       hop_length=hop_length,
+                       top_db=top_db)
+
     def generate_spectrogram_and_audio(
         self,
         steps: int = None,
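With this hunk, `Mel` is built after the pipeline loads, from the checkpoint's own resolution. A hedged usage sketch (the model id is the default from the diff; the `x_res`/`y_res` attribute access is taken from the hunks below):

# Sketch: the mel resolution now follows the loaded checkpoint.
from audiodiffusion import AudioDiffusion

audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")
# For this 256x256 model, both should print 256; a non-square
# checkpoint would yield sample_size[1] and sample_size[0].
print(audio_diffusion.mel.x_res, audio_diffusion.mel.y_res)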
@@ -180,12 +184,9 @@ class AudioDiffusionPipeline(DiffusionPipeline):
         if steps is not None:
             self.scheduler.set_timesteps(steps)
         mask = None
-        # For backwards compatibility
-        sample_size = (self.unet.sample_size, self.unet.sample_size) if type(
-            self.unet.sample_size) == int else self.unet.sample_size
-        images = noise = torch.randn((batch_size, self.unet.in_channels) +
-                                     sample_size,
-                                     generator=generator)
+        images = noise = torch.randn(
+            (batch_size, self.unet.in_channels, mel.y_res, mel.x_res),
+            generator=generator)
 
         if audio_file is not None or raw_audio is not None:
             mel.load_audio(audio_file, raw_audio)
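Note the noise shape is unchanged in effect: the constructor above guarantees `mel.y_res == sample_size[0]` and `mel.x_res == sample_size[1]`, so this is the same tensor with `Mel` as the single source of truth. A quick shape check under that assumption:

import torch

# Assumed values for a single-channel 256x256 checkpoint.
batch_size, in_channels, y_res, x_res = 2, 1, 256, 256
noise = torch.randn((batch_size, in_channels, y_res, x_res))
assert noise.shape == (2, 1, 256, 256)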
@@ -207,8 +208,7 @@ class AudioDiffusionPipeline(DiffusionPipeline):
             torch.tensor(input_images[:, np.newaxis, np.newaxis, :]),
             noise, torch.tensor(steps - start_step))
 
-        pixels_per_second = (mel.get_sample_rate() * sample_size[1] /
-                             mel.hop_length / mel.x_res)
+        pixels_per_second = (mel.get_sample_rate() / mel.hop_length)
         mask_start = int(mask_start_secs * pixels_per_second)
         mask_end = int(mask_end_secs * pixels_per_second)
         mask = self.scheduler.add_noise(
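The `pixels_per_second` simplification is a direct consequence of the constructor change: `mel.x_res` is now always `sample_size[1]`, so the factor `sample_size[1] / mel.x_res` equals 1 and drops out. A quick check with the default settings from the diff:

# With x_res == sample_size[1] (guaranteed by the new constructor):
sample_rate, hop_length, width = 22050, 512, 256
old = sample_rate * width / hop_length / width  # removed expression
new = sample_rate / hop_length                  # added expression
assert old == new  # ~43.07 spectrogram columns per second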
 
notebooks/test_model.ipynb CHANGED
The diff for this file is too large to render. See raw diff