fffiloni committed on
Commit
8e4cf69
·
1 Parent(s): f7c49f8

Update animatediff/pipelines/pipeline_animation.py

Browse files
animatediff/pipelines/pipeline_animation.py CHANGED
@@ -317,25 +317,28 @@ class AnimationPipeline(DiffusionPipeline):
317
  rand_device = "cpu" if device.type == "mps" else device
318
 
319
  if isinstance(generator, list):
320
- # Initialize latents as a random tensor
321
- latents = torch.randn(shape, device=rand_device, dtype=dtype)
 
 
 
 
 
 
322
 
323
- # If init_latents is not None, copy the values for each video frame
324
- if init_latents is not None:
325
- for i in range(video_length):
326
- init_alpha = (video_length - float(i)) / video_length / 30
327
- latents[:, :, i, :, :] = init_latents * init_alpha + latents[:, :, i, :, :] * (1 - init_alpha)
328
 
329
- latents = latents.to(device)
330
  else:
331
  latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
332
-
333
- # If init_latents is not None, repeat it for the entire batch
334
  if init_latents is not None:
335
- init_latents = init_latents.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
336
  for i in range(video_length):
337
- init_alpha = (video_length - float(i)) / video_length / 30
338
- latents[:, :, i, :, :] = init_latents[:, :, i, :, :] * init_alpha + latents[:, :, i, :, :] * (1 - init_alpha)
 
 
 
 
 
339
 
340
  else:
341
  if latents.shape != shape:
 
317
  rand_device = "cpu" if device.type == "mps" else device
318
 
319
  if isinstance(generator, list):
320
+ shape = shape
321
+ # shape = (1,) + shape[1:]
322
+ # ignore init latents for batch model
323
+ latents = [
324
+ torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
325
+ for i in range(batch_size)
326
+ ]
327
+ latents = torch.cat(latents, dim=0).to(device)
328
 
 
 
 
 
 
329
 
 
330
  else:
331
  latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
 
 
332
  if init_latents is not None:
333
+
334
  for i in range(video_length):
335
+ # Empirically, dividing by 30 yields stable results, though the reason is not understood
336
+ # gradually reduce init alpha along the video frames (loosening the restriction)
337
+ init_alpha = (video_length - float(i)) / video_length / 30
338
+ latents[:, :, i, :, :] = init_latents * init_alpha + latents[:, :, i, :, :] * (1 - init_alpha)
339
+
340
+
341
+
342
 
343
  else:
344
  if latents.shape != shape: