alex committed
Commit c727d79 · 1 Parent(s): 9d08de1

warning fixed
Files changed (4)
  1. app.py +10 -7
  2. ovi/modules/model.py +13 -13
  3. ovi/modules/vae.py +6 -6
  4. ovi/modules/vae2_2.py +5 -5
app.py CHANGED
@@ -143,6 +143,7 @@ def resize_for_model(image_path):
 
 @spaces.GPU()
 def generate_image(text_prompt, session_id, image_height = 1024, image_width = 1024):
+    print("image generation used")
 
     if flux_model is None:
         return None
@@ -195,6 +196,8 @@ def generate_scene(
 
     if session_id is None:
        session_id = uuid.uuid4().hex
+
+    print(f"generating scene with {sample_steps} steps")
 
     return generate_video(text_prompt,
                           sample_steps,
@@ -346,7 +349,7 @@ with gr.Blocks(css=css, theme=theme) as demo:
                 lines=5,
                 placeholder="Describe your scene...")
             sample_steps = gr.Slider(
-                value=50,
+                value=20,
                 label="Sample Steps",
                 minimum=20,
                 maximum=100,
@@ -393,40 +396,40 @@ with gr.Blocks(css=css, theme=theme) as demo:
 
            [
                "What's the difference between having a job and having no life?",
-                50,
+                20,
                "example_prompts/pngs/91.png",
            ],
 
 
            [
                "a alien creature looking to the right and slowly turning to the camera while drooling from her teeth and says <S>Hiss, You thought I can't talk.<E> then start screaming in a high pitch voice <AUDCAP>the alien has a raspy voice<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/90.png",
            ],
 
 
            [
                "The video opens with a close-up of a woman with vibrant reddish-orange, shoulder-length hair and heavy dark eye makeup. She is wearing a dark brown leather jacket over a grey hooded top. She looks intently to her right, her mouth slightly agape, and her expression is serious and focused. The background shows a room with light green walls and dark wooden cabinets on the left, and a green plant on the right. She speaks, her voice clear and direct, saying, <S>doing<E>. She then pauses briefly, her gaze unwavering, and continues, <S>And I need you to trust them.<E>. Her mouth remains slightly open, indicating she is either about to speak more or has just finished a sentence, with a look of intense sincerity.. <AUDCAP>Tense, dramatic background music, clear female voice.<ENDAUDCAP>",
-                50,
+                20,
                image_example,
            ],
 
            [
                "A young woman with long, wavy blonde hair and light-colored eyes is shown in a medium shot against a blurred backdrop of lush green foliage. She wears a denim jacket over a striped top. Initially, her eyes are closed and her mouth is slightly open as she speaks, <S>Enjoy this moment<E>. Her eyes then slowly open, looking slightly upwards and to the right, as her expression shifts to one of thoughtful contemplation. She continues to speak, <S>No matter where it's taking<E>, her gaze then settling with a serious and focused look towards someone off-screen to her right.. <AUDCAP>Clear female voice, faint ambient outdoor sounds.<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/2.png",
            ],
 
            [
                "A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the \"CHOICE FM\" logo and various social media handles like \"@ilovechoicefm\" with \"RALEIGH\" below it. The man intently addresses the microphone, articulating, <S>is talent. It's all about authenticity. You gotta be who you really are, especially if you're working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/5.png",
            ],
 
 
            [
                "The scene is set outdoors with a blurry, bright green background, suggesting grass and a sunny environment. On the left, a woman with long, dark hair, wearing a red top and a necklace with a white pendant, faces towards the right. Her expression is serious and slightly perturbed as she speaks, with her lips slightly pursed. She says, <S>UFO, UFC thing.<E> On the right, the back of a man's head and his right ear are visible, indicating he is facing away from the camera, listening to the woman. He has short, dark hair. The woman continues speaking, her expression remaining serious, <S>And if you're not watching that, it's one of those ancient movies from an era that's<E> as the frame holds steady on the two figures.. <AUDCAP>Clear female speech, distant low-frequency hum.<ENDAUDCAP>",
-                50,
+                20,
                "example_prompts/pngs/9.png",
            ],
 
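The slider's default now matches its lower bound, and the example rows pass 20 to stay consistent with it. A minimal standalone sketch of the updated control (assuming Gradio 4.x; the names mirror app.py but this is illustrative, not the full app):

import gradio as gr

with gr.Blocks() as demo:
    # Default dropped from 50 to 20; 20 is also the minimum, so every bundled
    # example (which now passes 20) stays inside the valid range.
    sample_steps = gr.Slider(
        value=20,
        label="Sample Steps",
        minimum=20,
        maximum=100,
    )

if __name__ == "__main__":
    demo.launch()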
 
ovi/modules/model.py CHANGED
@@ -2,7 +2,7 @@
 import math
 
 import torch
-import torch.cuda.amp as amp
+import torch.amp as amp
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -34,7 +34,7 @@ def sinusoidal_embedding_1d(dim, position):
     return x
 
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_params(max_seq_len, dim, theta=10000, freqs_scaling=1.0):
     assert dim % 2 == 0
     pos = torch.arange(max_seq_len)
@@ -44,7 +44,7 @@ def rope_params(max_seq_len, dim, theta=10000, freqs_scaling=1.0):
     freqs = torch.polar(torch.ones_like(freqs), freqs)
     return freqs
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_apply_1d(x, grid_sizes, freqs):
     n, c = x.size(2), x.size(3) // 2 ## b l h d
     c_rope = freqs.shape[1] # number of complex dims to rotate
@@ -69,7 +69,7 @@ def rope_apply_1d(x, grid_sizes, freqs):
         output.append(x_i)
     return torch.stack(output).bfloat16()
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_apply_3d(x, grid_sizes, freqs):
     n, c = x.size(2), x.size(3) // 2
 
@@ -99,7 +99,7 @@ def rope_apply_3d(x, grid_sizes, freqs):
         output.append(x_i)
     return torch.stack(output).bfloat16()
 
-@amp.autocast(enabled=False)
+@amp.autocast('cuda', enabled=False)
 def rope_apply(x, grid_sizes, freqs):
     x_ndim = grid_sizes.shape[-1]
     if x_ndim == 3:
@@ -176,7 +176,7 @@ class WanRMSNorm(nn.Module):
         Args:
             x(Tensor): Shape [B, L, C]
         """
-        return self._norm(x.bfloat16()).type_as(x) * self.weight
+        return self._norm(x.bfloat16()).type_as(x) * self.weight.bfloat16()
 
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
@@ -371,7 +371,7 @@ class ModulationAdd(nn.Module):
         self.modulation = nn.Parameter(torch.randn(1, num, dim) / dim**0.5)
 
     def forward(self, e):
-        return self.modulation + e
+        return self.modulation.bfloat16() + e.bfloat16()
 
 class WanAttentionBlock(nn.Module):
 
@@ -447,7 +447,7 @@ class WanAttentionBlock(nn.Module):
         """
         assert e.dtype == torch.bfloat16
         assert len(e.shape) == 4 and e.size(2) == 6 and e.shape[1] == x.shape[1], f"{e.shape}, {x.shape}"
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+        with amp.autocast('cuda', dtype=torch.bfloat16):
            e = self.modulation(e).chunk(6, dim=2)
        assert e[0].dtype == torch.bfloat16
 
@@ -455,7 +455,7 @@ class WanAttentionBlock(nn.Module):
         y = self.self_attn(
             self.norm1(x).bfloat16() * (1 + e[1].squeeze(2)) + e[0].squeeze(2),
             seq_lens, grid_sizes, freqs)
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+        with amp.autocast('cuda', dtype=torch.bfloat16):
             x = x + y * e[2].squeeze(2)
 
         # cross-attention & ffn function
@@ -463,7 +463,7 @@ class WanAttentionBlock(nn.Module):
             x = x + self.cross_attn(self.norm3(x), context, context_lens)
             y = self.ffn(
                 self.norm2(x).bfloat16() * (1 + e[4].squeeze(2)) + e[3].squeeze(2))
-            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+            with amp.autocast('cuda', dtype=torch.bfloat16):
                 x = x + y * e[5].squeeze(2)
             return x
 
@@ -495,8 +495,8 @@ class Head(nn.Module):
             e(Tensor): Shape [B, L, C]
         """
         assert e.dtype == torch.bfloat16
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
-            e = (self.modulation.unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2) # 1 1 2 D, B L 1 D -> B L 2 D -> 2 * (B L 1 D)
+        with amp.autocast('cuda', dtype=torch.bfloat16):
+            e = (self.modulation.bfloat16().unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2) # 1 1 2 D, B L 1 D -> B L 2 D -> 2 * (B L 1 D)
         x = (self.head(self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2)))
         return x
 
@@ -740,7 +740,7 @@ class WanModel(ModelMixin, ConfigMixin):
             # print(f"zeroing out first {_first_images_seq_len} from t: {t.shape}, {t}")
         else:
             t = t.unsqueeze(1).expand(t.size(0), seq_len)
-        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
+        with amp.autocast('cuda', dtype=torch.bfloat16):
             bt = t.size(0)
             t = t.flatten()
             e = self.time_embedding(
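The changes above (and in the two VAE files below) all follow the same pattern: the deprecated torch.cuda.amp entry points are replaced by torch.amp with an explicit 'cuda' device argument, which is what removes the FutureWarning on recent PyTorch. A minimal sketch of the migration, assuming PyTorch 2.x where torch.amp.autocast takes the device type as its first argument (the function names here are illustrative only):

import torch
import torch.amp as amp

# Old, deprecated spelling (what these files used before this commit):
#     import torch.cuda.amp as amp
#     with amp.autocast(dtype=torch.bfloat16): ...
#
# New spelling used throughout the commit: pass the device type explicitly.
def matmul_bf16(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    with amp.autocast('cuda', dtype=torch.bfloat16):
        return a @ b

# The decorator form migrates the same way, e.g. keeping RoPE-style math out of autocast:
@amp.autocast('cuda', enabled=False)
def fp32_only(t: torch.Tensor) -> torch.Tensor:
    return t.float() * 2.0

if __name__ == "__main__":
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(8, 8, device=dev)
    print(matmul_bf16(a, a).dtype)  # bfloat16 under CUDA autocast, float32 on CPU
    print(fp32_only(a).dtype)       # always float32: autocast is disabled inside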
ovi/modules/vae.py CHANGED
@@ -2,7 +2,7 @@
 import logging
 
 import torch
-import torch.cuda.amp as amp
+import torch.amp as amp
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
@@ -672,14 +672,14 @@ class WanVAE:
         """
         videos: A list of videos each with shape [C, T, H, W].
         """
-        with amp.autocast(dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return [
                 self.model.encode(u.unsqueeze(0), self.scale).float().squeeze(0)
                 for u in videos
             ]
 
     def decode(self, zs):
-        with amp.autocast(dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return [
                 self.model.decode(u.unsqueeze(0),
                                   self.scale).float().clamp_(-1, 1).squeeze(0)
@@ -688,16 +688,16 @@
 
     @torch.no_grad()
     def wrapped_decode(self, z):
-        with torch.amp.autocast('cuda', dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
            return self.model.decode(z, self.scale).float().clamp_(-1, 1)
 
     @torch.no_grad()
     def wrapped_decode_stream(self, z):
-        with torch.amp.autocast('cuda', dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return self.model.decode_stream(z, self.scale).float().clamp_(-1, 1)
 
     @torch.no_grad()
     def wrapped_encode(self, video):
-        with torch.amp.autocast('cuda', dtype=self.dtype):
+        with amp.autocast('cuda', dtype=self.dtype):
             return self.model.encode(video, self.scale).float()
 
ovi/modules/vae2_2.py CHANGED
@@ -2,7 +2,7 @@
 import logging
 
 import torch
-import torch.cuda.amp as amp
+import torch.amp as amp
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
@@ -1025,7 +1025,7 @@ class Wan2_2_VAE:
         try:
             if not isinstance(videos, list):
                 raise TypeError("videos should be a list")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
                 return [
                     self.model.encode(u.unsqueeze(0),
                                       self.scale).float().squeeze(0)
@@ -1039,7 +1039,7 @@
         try:
             if not isinstance(zs, list):
                 raise TypeError("zs should be a list")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
                 return [
                     self.model.decode(u.unsqueeze(0),
                                       self.scale).float().clamp_(-1,
@@ -1054,7 +1054,7 @@
         try:
             if not isinstance(zs, torch.Tensor):
                 raise TypeError("zs should be a torch.Tensor")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
                 return self.model.decode(zs, self.scale).float().clamp_(-1,
                                                                         1)
 
@@ -1066,7 +1066,7 @@
         try:
             if not isinstance(video, torch.Tensor):
                 raise TypeError("video should be a torch.Tensor")
-            with amp.autocast(dtype=self.dtype):
+            with amp.autocast('cuda', dtype=self.dtype):
 
                 return self.model.encode(video, self.scale).float()
 