lev1 committed
Commit 2d7762b
1 Parent(s): a681a6f

T2V Tab improvements

app_text_to_video.py CHANGED
@@ -1,16 +1,17 @@
 import gradio as gr
 from model import Model
+from functools import partial
 
 examples = [
-    "an astronaut waving the arm on the moon",
-    "a sloth surfing on a wakeboard",
-    "an astronaut walking on a street",
-    "a cute cat walking on grass",
-    "a horse is galloping on a street",
-    "an astronaut is skiing down the hill",
-    "a gorilla walking alone down the street"
-    "a gorilla dancing on times square",
-    "A panda dancing dancing like crazy on Times Square",
+    ["an astronaut waving the arm on the moon"],
+    ["a sloth surfing on a wakeboard"],
+    ["an astronaut walking on a street"],
+    ["a cute cat walking on grass"],
+    ["a horse is galloping on a street"],
+    ["an astronaut is skiing down the hill"],
+    ["a gorilla walking alone down the street"],
+    ["a gorilla dancing on times square"],
+    ["A panda dancing dancing like crazy on Times Square"],
 ]
 
 
@@ -24,17 +25,35 @@ def create_demo(model: Model):
             with gr.Column():
                 prompt = gr.Textbox(label='Prompt')
                 run_button = gr.Button(label='Run')
+                with gr.Accordion('Advanced options', open=False):
+                    motion_field_strength_x = gr.Slider(label='Global Translation $\delta_{x}$',
+                                                        minimum=-20,
+                                                        maximum=20,
+                                                        value=12,
+                                                        step=1)
+
+                    motion_field_strength_y = gr.Slider(label='Global Translation $\delta_{y}$',
+                                                        minimum=-20,
+                                                        maximum=20,
+                                                        value=12,
+                                                        step=1)
+                    # a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
+                    n_prompt = gr.Textbox(label="Optional Negative Prompt",
+                                          value='')
             with gr.Column():
                 result = gr.Video(label="Generated Video")
         inputs = [
-            prompt,
+            prompt,
+            motion_field_strength_x,
+            motion_field_strength_y,
+            n_prompt
         ]
 
         gr.Examples(examples=examples,
                     inputs=inputs,
                     outputs=result,
-                    cache_examples=False,
-                    #cache_examples=os.getenv('SYSTEM') == 'spaces')
+                    # cache_examples=False,
+                    cache_examples=os.getenv('SYSTEM') == 'spaces',
                     run_on_click=False,
                     )
 
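The new sliders and the negative-prompt box only take effect once they are passed to the Run button callback; the run_button.click(...) wiring itself lies outside the hunks shown above. A minimal sketch of how the handler could be bound, assuming the functools.partial import added at the top of the file is intended for this purpose (the helper name below is illustrative, not part of the commit):

# Hypothetical wiring, not shown in this diff: bind the Gradio components
# to the model method that now accepts the extra controls.
from functools import partial

def wire_run_button(model, run_button, prompt, motion_field_strength_x,
                    motion_field_strength_y, n_prompt, result):
    inputs = [prompt, motion_field_strength_x, motion_field_strength_y, n_prompt]
    # partial() would also allow fixing defaults (e.g. fps) up front if desired.
    run_button.click(fn=partial(model.process_text2video),
                     inputs=inputs,
                     outputs=result)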
 
model.py CHANGED
@@ -255,26 +255,71 @@ class Model:
                                 )
         return utils.create_video(result, fps)
 
-    def process_text2video(self, prompt, resolution=512, seed=24, num_frames=8, fps=4, t0=881, t1=941,
-                           use_cf_attn=True, use_motion_field=True, use_foreground_motion_field=False,
-                           smooth_bg=False, smooth_bg_strength=0.4, motion_field_strength=12):
+    # def process_text2video(self, prompt, resolution=512, seed=24, num_frames=8, fps=4, t0=881, t1=941,
+    #                        use_cf_attn=True, use_motion_field=True, use_foreground_motion_field=False,
+    #                        smooth_bg=False, smooth_bg_strength=0.4, motion_field_strength=12):
+
+    #     if self.model_type != ModelType.Text2Video:
+    #         unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+    #         self.set_model(ModelType.Text2Video, model_id="runwayml/stable-diffusion-v1-5", unet=unet)
+    #         self.pipe.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
+    #         self.pipe.unet.set_attn_processor(processor=self.text2video_attn_proc)
+    #     self.generator.manual_seed(seed)
+
+
+    #     added_prompt = "high quality, HD, 8K, trending on artstation, high focus, dramatic lighting"
+    #     self.generator.manual_seed(seed)
+
+    #     prompt = prompt.rstrip()
+    #     if len(prompt) > 0 and (prompt[-1] == "," or prompt[-1] == "."):
+    #         prompt = prompt.rstrip()[:-1]
+    #     prompt = prompt.rstrip()
+    #     prompt = prompt + ", "+added_prompt
+
+    #     result = self.inference(prompt=[prompt],
+    #                             video_length=num_frames,
+    #                             height=resolution,
+    #                             width=resolution,
+    #                             num_inference_steps=50,
+    #                             guidance_scale=7.5,
+    #                             guidance_stop_step=1.0,
+    #                             t0=t0,
+    #                             t1=t1,
+    #                             use_foreground_motion_field=use_foreground_motion_field,
+    #                             motion_field_strength=motion_field_strength,
+    #                             use_motion_field=use_motion_field,
+    #                             smooth_bg=smooth_bg,
+    #                             smooth_bg_strength=smooth_bg_strength,
+    #                             seed=seed,
+    #                             output_type='numpy',
+    #                             )
+    #     return utils.create_video(result, fps)
+
+    def process_text2video(self, prompt, motion_field_strength_x=12,motion_field_strength_y=12, n_prompt="", resolution=512, seed=24, num_frames=8, fps=4, t0=881, t1=941,
+                           use_cf_attn=True, use_motion_field=True,
+                           smooth_bg=False, smooth_bg_strength=0.4 ):
 
         if self.model_type != ModelType.Text2Video:
-            unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+            unet = UNet2DConditionModel.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="unet")
             self.set_model(ModelType.Text2Video, model_id="runwayml/stable-diffusion-v1-5", unet=unet)
             self.pipe.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
-            self.pipe.unet.set_attn_processor(processor=self.text2video_attn_proc)
+            if use_cf_attn:
+                self.pipe.unet.set_attn_processor(processor=self.text2video_attn_proc)
         self.generator.manual_seed(seed)
-
+
 
         added_prompt = "high quality, HD, 8K, trending on artstation, high focus, dramatic lighting"
-        self.generator.manual_seed(seed)
+        negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic'
 
         prompt = prompt.rstrip()
         if len(prompt) > 0 and (prompt[-1] == "," or prompt[-1] == "."):
             prompt = prompt.rstrip()[:-1]
         prompt = prompt.rstrip()
         prompt = prompt + ", "+added_prompt
+        if len(n_prompt)>0:
+            negative_prompt = [n_prompt]
+        else:
+            negative_prompt = None
 
         result = self.inference(prompt=[prompt],
                                 video_length=num_frames,
@@ -285,12 +330,13 @@ class Model:
                                 guidance_stop_step=1.0,
                                 t0=t0,
                                 t1=t1,
-                                use_foreground_motion_field=use_foreground_motion_field,
-                                motion_field_strength=motion_field_strength,
+                                motion_field_strength_x=motion_field_strength_x,
+                                motion_field_strength_y=motion_field_strength_y,
                                 use_motion_field=use_motion_field,
                                 smooth_bg=smooth_bg,
                                 smooth_bg_strength=smooth_bg_strength,
                                 seed=seed,
                                 output_type='numpy',
+                                negative_prompt = negative_prompt,
                                 )
-        return utils.create_video(result, fps)
+        return utils.create_video(result, fps)
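Taken together, the rewritten process_text2video exposes per-axis motion strength and an optional negative prompt while dropping the foreground motion field. A minimal usage sketch, assuming an already constructed Model instance (constructor arguments are not part of this diff and appear below only as a placeholder):

# Usage sketch; the Model() construction below is hypothetical.
from model import Model

model = Model()  # actual constructor arguments may differ
video_path = model.process_text2video(
    "a horse is galloping on a street",
    motion_field_strength_x=12,             # per-frame global translation along x
    motion_field_strength_y=12,             # per-frame global translation along y
    n_prompt="low quality, deformed body",  # optional negative prompt
)

Note that when n_prompt is empty, negative_prompt is passed to inference as None, so the hard-coded negative_prompts string defined in the method is not used by the call in this revision.
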
text_to_video/text_to_video_generator.py CHANGED
@@ -13,6 +13,8 @@ class TextToVideo():
         g.manual_seed(22)
         self.g = g
 
+        assert sd_path is not None
+
         print(f"Loading model SD-Net model file from {sd_path}")
 
         self.dtype = torch.float16
text_to_video/text_to_video_pipeline.py CHANGED
@@ -142,7 +142,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if t > skip_t:
-                    # print("Skipping frame!")
                     continue
                 else:
                     if not entered:
@@ -235,19 +234,20 @@ class TextToVideoPipeline(StableDiffusionPipeline):
                                          List[torch.Generator]]] = None,
         xT: Optional[torch.FloatTensor] = None,
         null_embs: Optional[torch.FloatTensor] = None,
-        motion_field_strength: float = 12,
+        #motion_field_strength: float = 12,
+        motion_field_strength_x: float = 12,
+        motion_field_strength_y: float = 12,
         output_type: Optional[str] = "tensor",
         return_dict: bool = True,
         callback: Optional[Callable[[
             int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
-        use_foreground_motion_field: bool = True,
         use_motion_field: bool = True,
         smooth_bg: bool = True,
         smooth_bg_strength: float = 0.4,
         **kwargs,
     ):
-
+        print(motion_field_strength_x,motion_field_strength_y)
         print(f" Use: Motion field = {use_motion_field}")
         print(f" Use: Background smoothing = {smooth_bg}")
         # Default height and width to unet
@@ -349,7 +349,9 @@ class TextToVideoPipeline(StableDiffusionPipeline):
             reference_flow = torch.zeros(
                 (video_length-1, 2, 512, 512), device=x_t0_1.device, dtype=x_t0_1.dtype)
             for fr_idx in range(video_length-1):
-                reference_flow[fr_idx, :, :, :] = motion_field_strength*(fr_idx+1)
+                #reference_flow[fr_idx, :, :, :] = motion_field_strength*(fr_idx+1)
+                reference_flow[fr_idx, 0, :, :] = motion_field_strength_x*(fr_idx+1)
+                reference_flow[fr_idx, 1, :, :] = motion_field_strength_y*(fr_idx+1)
 
             for idx, latent in enumerate(x_t0_k):
                 x_t0_k[idx] = self.warp_latents_independently(
@@ -379,63 +381,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
             x_t0_k = x_t0_1[:, :, 1:, :, :].clone()
             x_t0_1 = x_t0_1[:,:,:1,:,:].clone()
 
-
-        move_object = use_foreground_motion_field
-        if move_object:
-            h, w = x0.shape[3], x0.shape[4]
-            # Move object
-            # reference_flow = torch.zeros(
-            #     (video_length-1, 2, 512, 512), device=x_t0_1.device, dtype=x_t0_1.dtype)
-            reference_flow_obj = torch.zeros(
-                (batch_size, video_length, 2, 512, 512), device=x_t0_1.device, dtype=x_t0_1.dtype)
-
-            for batch_idx, x0_b in enumerate(x0):
-                tmp = x0_b[None]
-                z0_b = []
-                for fr_split in range(tmp.shape[2]):
-                    z0_b.append(self.decode_latents(
-                        tmp[:, :, fr_split, None]).detach())
-                z0_b = torch.cat(z0_b, dim=2)
-                z0_b = rearrange(z0_b[0], "c f h w -> f h w c")
-                shift = (-5 - 5) * torch.rand(2,
-                                              device=x0.device, dtype=x0.dtype) + 5
-                for frame_idx, z0_f in enumerate(z0_b):
-                    if frame_idx > 0:
-
-                        z0_f = torch.round(
-                            z0_f * 255).cpu().numpy().astype(np.uint8)
-
-                        # apply SOD detection to obtain mask of foreground object
-                        m_f = torch.tensor(self.sod_model.process_data(
-                            z0_f), device=x0.device).to(x0.dtype)
-                        kernel = torch.ones(
-                            5, 5, device=x0.device, dtype=x0.dtype)
-                        mask = dilation(
-                            m_f[None, None].to(x0.device), kernel)[0]
-                        for coord_idx in range(2):
-                            reference_flow_obj[batch_idx, frame_idx,
-                                               coord_idx, :, :] = (1+frame_idx) * shift[coord_idx] * mask
-
-
-
-            for idx, x_t0_k_b in enumerate(x_t0_k):
-                x_t0_k[idx] = self.warp_latents_independently(
-                    x_t0_k_b[None], reference_flow_obj[idx, 1:])
-
-            x_t1_k = self.DDPM_forward(
-                x0=x_t0_k, t0=t0, tMax=t1, device=device, shape=shape, text_embeddings=text_embeddings, generator=generator)
-
-            if x_t1_1 is None:
-                raise Exception
-            x_t1 = torch.cat([x_t1_1, x_t1_k], dim=2)
-
-            # del latent
-            ddim_res = self.DDIM_backward(num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=t1, t0=-1, t1=-1, do_classifier_free_guidance=do_classifier_free_guidance,
-                                          null_embs=null_embs, text_embeddings=text_embeddings, latents_local=x_t1, latents_dtype=dtype, guidance_scale=guidance_scale, guidance_stop_step=guidance_stop_step, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, num_warmup_steps=num_warmup_steps)
-            x0 = ddim_res["x0"].detach()
-            del ddim_res
-
-
         # smooth background
         if smooth_bg:
             h, w = x0.shape[3], x0.shape[4]
@@ -474,9 +419,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
                         x_t1_fg_masked_b, reference_flow)
                 else:
                     x_t1_fg_masked_b = x_t1_fg_masked_b[None]
-                if move_object:
-                    x_t1_fg_masked_b = self.warp_latents_independently(
-                        x_t1_fg_masked_b, reference_flow_obj[batch_idx, 1:])
 
                 x_t1_fg_masked_b = torch.cat(
                     [x_t1_1_fg_masked_b[None], x_t1_fg_masked_b], dim=2)
@@ -493,9 +435,6 @@ class TextToVideoPipeline(StableDiffusionPipeline):
                 if use_motion_field:
                     m_fg_b = self.warp_latents_independently(
                         m_fg_b.clone(), reference_flow)
-                if move_object:
-                    m_fg_b = self.warp_latents_independently(
-                        m_fg_b, reference_flow_obj[batch_idx, 1:])
                 M_FG_warped.append(
                     torch.cat([m_fg_1_b[:1, 0], m_fg_b[:1, 0]], dim=1))
 
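The reference-flow change above is the core of the new per-axis control: each of the video_length - 1 warped frames receives a constant translation whose magnitude grows linearly with the frame index, with the x and y strengths set independently instead of a single shared scalar. A standalone sketch of the same construction (the function wrapper is illustrative and not part of the pipeline):

import torch

def build_reference_flow(video_length: int,
                         motion_field_strength_x: float = 12,
                         motion_field_strength_y: float = 12,
                         size: int = 512) -> torch.Tensor:
    # Shape (frames-1, 2, H, W): channel 0 carries the x displacement,
    # channel 1 the y displacement, mirroring the pipeline code.
    reference_flow = torch.zeros((video_length - 1, 2, size, size))
    for fr_idx in range(video_length - 1):
        reference_flow[fr_idx, 0, :, :] = motion_field_strength_x * (fr_idx + 1)
        reference_flow[fr_idx, 1, :, :] = motion_field_strength_y * (fr_idx + 1)
    return reference_flow

# Example: build_reference_flow(8) yields 7 flow fields with displacements 12, 24, ..., 84.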