alex committed
Commit c727d79 · 1 Parent(s): 9d08de1

warning fixed
Files changed (4)
  1. app.py +10 -7
  2. ovi/modules/model.py +13 -13
  3. ovi/modules/vae.py +6 -6
  4. ovi/modules/vae2_2.py +5 -5
app.py CHANGED
@@ -143,6 +143,7 @@ def resize_for_model(image_path):
 
 @spaces.GPU()
 def generate_image(text_prompt, session_id, image_height = 1024, image_width = 1024):
+    print("image generation used")
 
     if flux_model is None:
         return None
@@ -195,6 +196,8 @@ def generate_scene(
 
     if session_id is None:
        session_id = uuid.uuid4().hex
+
+    print(f"generating scene with {sample_steps} steps")
 
     return generate_video(text_prompt,
                           sample_steps,
@@ -346,7 +349,7 @@ with gr.Blocks(css=css, theme=theme) as demo:
                 lines=5,
                 placeholder="Describe your scene...")
             sample_steps = gr.Slider(
-                value=50,
+                value=20,
                 label="Sample Steps",
                 minimum=20,
                 maximum=100,
@@ -393,40 +396,40 @@ with gr.Blocks(css=css, theme=theme) as demo:
 
            [
                "What's the difference between having a job and having no life?",
-                50,
+                20,
                "example_prompts/pngs/91.png",
            ],
 
 
            [
                "a alien creature looking to the right and slowly turning to the camera while drooling from her teeth and says <S>Hiss, You thought I can't talk.<E> then start screaming in a high pitch voice <AUDCAP>the alien has a raspy voice<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/90.png",
            ],
 
 
            [
                "The video opens with a close-up of a woman with vibrant reddish-orange, shoulder-length hair and heavy dark eye makeup. She is wearing a dark brown leather jacket over a grey hooded top. She looks intently to her right, her mouth slightly agape, and her expression is serious and focused. The background shows a room with light green walls and dark wooden cabinets on the left, and a green plant on the right. She speaks, her voice clear and direct, saying, <S>doing<E>. She then pauses briefly, her gaze unwavering, and continues, <S>And I need you to trust them.<E>. Her mouth remains slightly open, indicating she is either about to speak more or has just finished a sentence, with a look of intense sincerity.. <AUDCAP>Tense, dramatic background music, clear female voice.<ENDAUDCAP>",
-                50,
+                20,
                image_example,
            ],
 
            [
                "A young woman with long, wavy blonde hair and light-colored eyes is shown in a medium shot against a blurred backdrop of lush green foliage. She wears a denim jacket over a striped top. Initially, her eyes are closed and her mouth is slightly open as she speaks, <S>Enjoy this moment<E>. Her eyes then slowly open, looking slightly upwards and to the right, as her expression shifts to one of thoughtful contemplation. She continues to speak, <S>No matter where it's taking<E>, her gaze then settling with a serious and focused look towards someone off-screen to her right.. <AUDCAP>Clear female voice, faint ambient outdoor sounds.<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/2.png",
            ],
 
            [
                "A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the \"CHOICE FM\" logo and various social media handles like \"@ilovechoicefm\" with \"RALEIGH\" below it. The man intently addresses the microphone, articulating, <S>is talent. It's all about authenticity. You gotta be who you really are, especially if you're working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/5.png",
            ],
 
 
            [
                "The scene is set outdoors with a blurry, bright green background, suggesting grass and a sunny environment. On the left, a woman with long, dark hair, wearing a red top and a necklace with a white pendant, faces towards the right. Her expression is serious and slightly perturbed as she speaks, with her lips slightly pursed. She says, <S>UFO, UFC thing.<E> On the right, the back of a man's head and his right ear are visible, indicating he is facing away from the camera, listening to the woman. He has short, dark hair. The woman continues speaking, her expression remaining serious, <S>And if you're not watching that, it's one of those ancient movies from an era that's<E> as the frame holds steady on the two figures.. <AUDCAP>Clear female speech, distant low-frequency hum.<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/9.png",
            ],
 
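The slider's default now matches its lower bound, and the example rows pass 20 to stay consistent with it. A minimal standalone sketch of the updated control (assuming Gradio 4.x; the names mirror app.py but this is illustrative, not the full app):

import gradio as gr

with gr.Blocks() as demo:
    # Default dropped from 50 to 20; 20 is also the minimum, so every bundled
    # example (which now passes 20) stays inside the valid range.
    sample_steps = gr.Slider(
        value=20,
        label="Sample Steps",
        minimum=20,
        maximum=100,
    )

if __name__ == "__main__":
    demo.launch()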
 
ovi/modules/model.py CHANGED
@@ -2,7 +2,7 @@
 import math
 
 import torch
-import torch.cuda.amp as amp
+import torch.amp as amp
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -34,7 +34,7 @@ def sinusoidal_embedding_1d(dim, position):
     return x
 
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_params(max_seq_len, dim, theta=10000, freqs_scaling=1.0):
     assert dim % 2 == 0
     pos = torch.arange(max_seq_len)
@@ -44,7 +44,7 @@ def rope_params(max_seq_len, dim, theta=10000, freqs_scaling=1.0):
     freqs = torch.polar(torch.ones_like(freqs), freqs)
     return freqs
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_apply_1d(x, grid_sizes, freqs):
     n, c = x.size(2), x.size(3) // 2 ## b l h d
     c_rope = freqs.shape[1] # number of complex dims to rotate
@@ -69,7 +69,7 @@ def rope_apply_1d(x, grid_sizes, freqs):
         output.append(x_i)
     return torch.stack(output).bfloat16()
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_apply_3d(x, grid_sizes, freqs):
     n, c = x.size(2), x.size(3) // 2
 
@@ -99,7 +99,7 @@ def rope_apply_3d(x, grid_sizes, freqs):
         output.append(x_i)
     return torch.stack(output).bfloat16()
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_apply(x, grid_sizes, freqs):
     x_ndim = grid_sizes.shape[-1]
     if x_ndim == 3:
@@ -176,7 +176,7 @@ class WanRMSNorm(nn.Module):
         Args:
             x(Tensor): Shape [B, L, C]
         """
-        return self._norm(x.bfloat16()).type_as(x) * self.weight
+        return self._norm(x.bfloat16()).type_as(x) * self.weight.bfloat16()
 
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
@@ -371,7 +371,7 @@ class ModulationAdd(nn.Module):
         self.modulation = nn.Parameter(torch.randn(1, num, dim) / dim**0.5)
 
     def forward(self, e):
-        return self.modulation + e
+        return self.modulation.bfloat16() + e.bfloat16()
 
 class WanAttentionBlock(nn.Module):
 
@@ -447,7 +447,7 @@ class WanAttentionBlock(nn.Module):
         """
         assert e.dtype == torch.bfloat16
         assert len(e.shape) == 4 and e.size(2) == 6 and e.shape[1] == x.shape[1], f"{e.shape}, {x.shape}"
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+        with amp.autocast('cuda', dtype=torch.bfloat16):
            e = self.modulation(e).chunk(6, dim=2)
        assert e[0].dtype == torch.bfloat16
 
@@ -455,7 +455,7 @@ class WanAttentionBlock(nn.Module):
         y = self.self_attn(
             self.norm1(x).bfloat16() * (1 + e[1].squeeze(2)) + e[0].squeeze(2),
             seq_lens, grid_sizes, freqs)
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+        with amp.autocast('cuda', dtype=torch.bfloat16):
             x = x + y * e[2].squeeze(2)
 
         # cross-attention & ffn function
@@ -463,7 +463,7 @@ class WanAttentionBlock(nn.Module):
             x = x + self.cross_attn(self.norm3(x), context, context_lens)
             y = self.ffn(
                 self.norm2(x).bfloat16() * (1 + e[4].squeeze(2)) + e[3].squeeze(2))
-            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+            with amp.autocast('cuda', dtype=torch.bfloat16):
                 x = x + y * e[5].squeeze(2)
             return x
 
@@ -495,8 +495,8 @@ class Head(nn.Module):
             e(Tensor): Shape [B, L, C]
         """
         assert e.dtype == torch.bfloat16
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            e = (self.modulation.unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2) # 1 1 2 D, B L 1 D -> B L 2 D -> 2 * (B L 1 D)
+        with amp.autocast('cuda', dtype=torch.bfloat16):
+            e = (self.modulation.bfloat16().unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2) # 1 1 2 D, B L 1 D -> B L 2 D -> 2 * (B L 1 D)
         x = (self.head(self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2)))
         return x
 
@@ -740,7 +740,7 @@ class WanModel(ModelMixin, ConfigMixin):
             # print(f"zeroing out first {_first_images_seq_len} from t: {t.shape}, {t}")
         else:
             t = t.unsqueeze(1).expand(t.size(0), seq_len)
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+        with amp.autocast('cuda', dtype=torch.bfloat16):
             bt = t.size(0)
             t = t.flatten()
             e = self.time_embedding(
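The changes above (and in the two VAE files below) all follow the same pattern: the deprecated torch.cuda.amp entry points are replaced by torch.amp with an explicit 'cuda' device argument, which is what removes the FutureWarning on recent PyTorch. A minimal sketch of the migration, assuming PyTorch 2.x where torch.amp.autocast takes the device type as its first argument (the function names here are illustrative only):

import torch
import torch.amp as amp

# Old, deprecated spelling (what these files used before this commit):
#     import torch.cuda.amp as amp
#     with amp.autocast(dtype=torch.bfloat16): ...
#
# New spelling used throughout the commit: pass the device type explicitly.
def matmul_bf16(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    with amp.autocast('cuda', dtype=torch.bfloat16):
        return a @ b

# The decorator form migrates the same way, e.g. keeping RoPE-style math out of autocast:
@amp.autocast('cuda', enabled=False)
def fp32_only(t: torch.Tensor) -> torch.Tensor:
    return t.float() * 2.0

if __name__ == "__main__":
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(8, 8, device=dev)
    print(matmul_bf16(a, a).dtype)  # bfloat16 under CUDA autocast, float32 on CPU
    print(fp32_only(a).dtype)       # always float32: autocast is disabled inside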
ovi/modules/vae.py CHANGED
@@ -2,7 +2,7 @@
 import logging
 
 import torch
-import torch.cuda.amp as amp
+import torch.amp as amp
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
@@ -672,14 +672,14 @@ class WanVAE:
         """
         videos: A list of videos each with shape [C, T, H, W].
         """
-        with amp.autocast(dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return [
                 self.model.encode(u.unsqueeze(0), self.scale).float().squeeze(0)
                 for u in videos
             ]
 
     def decode(self, zs):
-        with amp.autocast(dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return [
                 self.model.decode(u.unsqueeze(0),
                                   self.scale).float().clamp_(-1, 1).squeeze(0)
@@ -688,16 +688,16 @@
 
     @torch.no_grad()
     def wrapped_decode(self, z):
-        with torch.amp.autocast('cuda', dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
            return self.model.decode(z, self.scale).float().clamp_(-1, 1)
 
     @torch.no_grad()
     def wrapped_decode_stream(self, z):
-        with torch.amp.autocast('cuda', dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return self.model.decode_stream(z, self.scale).float().clamp_(-1, 1)
 
     @torch.no_grad()
     def wrapped_encode(self, video):
-        with torch.amp.autocast('cuda', dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return self.model.encode(video, self.scale).float()
 
ovi/modules/vae2_2.py CHANGED
@@ -2,7 +2,7 @@
 import logging
 
 import torch
-import torch.cuda.amp as amp
+import torch.amp as amp
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
@@ -1025,7 +1025,7 @@ class Wan2_2_VAE:
         try:
             if not isinstance(videos, list):
                 raise TypeError("videos should be a list")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
                 return [
                     self.model.encode(u.unsqueeze(0),
                                       self.scale).float().squeeze(0)
@@ -1039,7 +1039,7 @@
         try:
             if not isinstance(zs, list):
                 raise TypeError("zs should be a list")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
                 return [
                     self.model.decode(u.unsqueeze(0),
                                       self.scale).float().clamp_(-1,
@@ -1054,7 +1054,7 @@
         try:
             if not isinstance(zs, torch.Tensor):
                 raise TypeError("zs should be a torch.Tensor")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
                 return self.model.decode(zs, self.scale).float().clamp_(-1,
                                                                         1)
 
@@ -1066,7 +1066,7 @@
         try:
             if not isinstance(video, torch.Tensor):
                 raise TypeError("video should be a torch.Tensor")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
 
                 return self.model.encode(video, self.scale).float()
 