teticio committed on
Commit
08ddd40
1 Parent(s): 034d18c

fix for latent diffusion

Browse files
Files changed (1) hide show
  1. audiodiffusion/__init__.py +8 -3
audiodiffusion/__init__.py CHANGED
@@ -10,7 +10,7 @@ from diffusers import (DiffusionPipeline, DDPMPipeline, UNet2DConditionModel,
10
 
11
  from .mel import Mel
12
 
13
- VERSION = "1.2.1"
14
 
15
 
16
  class AudioDiffusion:
@@ -199,8 +199,11 @@ class AudioDiffusionPipeline(DiffusionPipeline):
199
  self.scheduler.set_timesteps(steps)
200
  step_generator = step_generator or generator
201
  mask = None
 
 
 
202
  images = noise = torch.randn(
203
- (batch_size, self.unet.in_channels, mel.y_res, mel.x_res),
204
  generator=generator)
205
 
206
  if audio_file is not None or raw_audio is not None:
@@ -223,7 +226,9 @@ class AudioDiffusionPipeline(DiffusionPipeline):
223
  torch.tensor(input_images[:, np.newaxis, np.newaxis, :]),
224
  noise, torch.tensor(steps - start_step))
225
 
226
- pixels_per_second = (mel.get_sample_rate() / mel.hop_length)
 
 
227
  mask_start = int(mask_start_secs * pixels_per_second)
228
  mask_end = int(mask_end_secs * pixels_per_second)
229
  mask = self.scheduler.add_noise(
 
10
 
11
  from .mel import Mel
12
 
13
+ VERSION = "1.2.2"
14
 
15
 
16
  class AudioDiffusion:
 
199
  self.scheduler.set_timesteps(steps)
200
  step_generator = step_generator or generator
201
  mask = None
202
+ # For backwards compatibility
203
+ if type(self.unet.sample_size) == int:
204
+ self.unet.sample_size = (self.unet.sample_size, self.unet.sample_size)
205
  images = noise = torch.randn(
206
+ (batch_size, self.unet.in_channels) + self.unet.sample_size,
207
  generator=generator)
208
 
209
  if audio_file is not None or raw_audio is not None:
 
226
  torch.tensor(input_images[:, np.newaxis, np.newaxis, :]),
227
  noise, torch.tensor(steps - start_step))
228
 
229
+ pixels_per_second = (self.unet.sample_size[1] *
230
+ mel.get_sample_rate() / mel.x_res /
231
+ mel.hop_length)
232
  mask_start = int(mask_start_secs * pixels_per_second)
233
  mask_end = int(mask_end_secs * pixels_per_second)
234
  mask = self.scheduler.add_noise(