lev1 committed
Commit 62cb566
Parent: cdcc7cc

Depth guided generation

Files changed (5)
  1. app.py +3 -0
  2. app_canny.py +7 -7
  3. app_depth.py +77 -0
  4. model.py +71 -0
  5. utils.py +29 -9
app.py CHANGED
@@ -7,6 +7,7 @@ from app_pose import create_demo as create_demo_pose
 from app_text_to_video import create_demo as create_demo_text_to_video
 from app_pix2pix_video import create_demo as create_demo_pix2pix_video
 from app_canny_db import create_demo as create_demo_canny_db
+from app_depth import create_demo as create_demo_depth
 import argparse
 import os
 
@@ -62,6 +63,8 @@ with gr.Blocks(css='style.css') as demo:
         create_demo_canny(model)
     with gr.Tab('Edge Conditional and Dreambooth Specialized'):
         create_demo_canny_db(model)
+    with gr.Tab('Depth Conditional'):
+        create_demo_depth(model)
     '''
     '''
     gr.HTML(
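For reference, a minimal sketch (illustrative, not part of this commit) of serving only the new depth tab; the Model(device=..., dtype=...) constructor arguments are assumed from how app.py builds its shared model and may differ:

import torch
import gradio as gr
from model import Model
from app_depth import create_demo as create_demo_depth

# Assumed constructor signature; adjust to the actual Model() API if it differs.
model = Model(device='cuda', dtype=torch.float16)

with gr.Blocks() as standalone:
    with gr.Tab('Depth Conditional'):
        create_demo_depth(model)  # same call app.py now makes inside its new tab

standalone.launch()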
app_canny.py CHANGED
@@ -7,19 +7,19 @@ on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
 def create_demo(model: Model):
 
     examples = [
-        ["__assets__/canny_videos_edge_2fps/butterfly.mp4",
+        ["__assets__/canny_videos_edge/butterfly.mp4",
            "white butterfly, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/deer.mp4",
+        ["__assets__/canny_videos_edge/deer.mp4",
            "oil painting of a deer, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/fox.mp4",
+        ["__assets__/canny_videos_edge/fox.mp4",
            "wild red fox is walking on the grass, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/girl_dancing.mp4",
+        ["__assets__/canny_videos_edge/girl_dancing.mp4",
            "oil painting of a girl dancing close-up, masterpiece, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/girl_turning.mp4",
+        ["__assets__/canny_videos_edge/girl_turning.mp4",
            "oil painting of a beautiful girl, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/halloween.mp4",
+        ["__assets__/canny_videos_edge/halloween.mp4",
            "beautiful girl halloween style, a high-quality, detailed, and professional photo"],
-        ["__assets__/canny_videos_edge_2fps/santa.mp4",
+        ["__assets__/canny_videos_edge/santa.mp4",
            "a santa claus, a high-quality, detailed, and professional photo"],
     ]
 
app_depth.py ADDED
@@ -0,0 +1,77 @@
+import gradio as gr
+from model import Model
+import os
+on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
+
+
+def create_demo(model: Model):
+
+    examples = [
+        ["__assets__/depth_videos/butterfly.mp4",
+            "white butterfly, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/deer.mp4",
+            "oil painting of a deer, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/fox.mp4",
+            "wild red fox is walking on the grass, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/girl_dancing.mp4",
+            "oil painting of a girl dancing close-up, masterpiece, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/girl_turning.mp4",
+            "oil painting of a beautiful girl, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/halloween.mp4",
+            "beautiful girl halloween style, a high-quality, detailed, and professional photo"],
+        ["__assets__/depth_videos/santa.mp4",
+            "a santa claus, a high-quality, detailed, and professional photo"],
+    ]
+
+    with gr.Blocks() as demo:
+        with gr.Row():
+            gr.Markdown('## Text and Depth Conditional Video Generation')
+        with gr.Row():
+            gr.HTML(
+                """
+                <div style="text-align: left; auto;">
+                <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                Description: For performance purposes, our current preview release supports any input videos but caps output videos after 80 frames and the input videos are scaled down before processing.
+                </h3>
+                </div>
+                """)
+
+        with gr.Row():
+            with gr.Column():
+                input_video = gr.Video(
+                    label="Input Video", source='upload', format="mp4", visible=True).style(height="auto")
+            with gr.Column():
+                prompt = gr.Textbox(label='Prompt')
+                run_button = gr.Button(label='Run')
+                with gr.Accordion('Advanced options', open=False):
+                    watermark = gr.Radio(["Picsart AI Research", "Text2Video-Zero",
+                                          "None"], label="Watermark", value='Picsart AI Research')
+                    chunk_size = gr.Slider(
+                        label="Chunk size", minimum=2, maximum=16, value=8, step=1, visible=not on_huggingspace,
+                        info="Number of frames processed at once. Reduce for lower memory usage.")
+                    merging_ratio = gr.Slider(
+                        label="Merging ratio", minimum=0.0, maximum=0.9, step=0.1, value=0.0, visible=not on_huggingspace,
+                        info="Ratio of how many tokens are merged. The higher the more compression (less memory and faster inference).")
+            with gr.Column():
+                result = gr.Video(label="Generated Video").style(height="auto")
+
+        inputs = [
+            input_video,
+            prompt,
+            chunk_size,
+            watermark,
+            merging_ratio,
+        ]
+
+        gr.Examples(examples=examples,
+                    inputs=inputs,
+                    outputs=result,
+                    fn=model.process_controlnet_depth,
+                    cache_examples=on_huggingspace,
+                    run_on_click=False,
+                    )
+
+        run_button.click(fn=model.process_controlnet_depth,
+                         inputs=inputs,
+                         outputs=result,)
+    return demo
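The inputs list above maps positionally onto model.process_controlnet_depth(video_path, prompt, chunk_size, watermark, merging_ratio, ...) defined in model.py below. A hedged sketch of calling that entry point headlessly, bypassing the UI; the example clip, output path, and Model constructor arguments are assumptions, not part of the commit:

import torch
from model import Model

model = Model(device='cuda', dtype=torch.float16)  # assumed constructor
result = model.process_controlnet_depth(
    video_path='__assets__/depth_videos/deer.mp4',  # one of the example assets above
    prompt='oil painting of a deer, a high-quality, detailed, and professional photo',
    chunk_size=8,                    # frames denoised per chunk (UI default)
    watermark='Picsart AI Research',
    merging_ratio=0.0,               # no token merging
    save_path='depth_deer_out.mp4',  # hypothetical output location
)
# `result` is whatever utils.create_video() returns (assumed to be the written file path).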
model.py CHANGED
@@ -13,6 +13,8 @@ import gradio_utils
 import os
 on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
 
+from einops import rearrange
+
 
 class ModelType(Enum):
     Pix2Pix_Video = 1,
@@ -20,6 +22,7 @@ class ModelType(Enum):
     ControlNetCanny = 3,
     ControlNetCannyDB = 4,
     ControlNetPose = 5,
+    ControlNetDepth = 6,
 
 
 class Model:
@@ -33,6 +36,7 @@ class Model:
             ModelType.ControlNetCanny: StableDiffusionControlNetPipeline,
             ModelType.ControlNetCannyDB: StableDiffusionControlNetPipeline,
             ModelType.ControlNetPose: StableDiffusionControlNetPipeline,
+            ModelType.ControlNetDepth: StableDiffusionControlNetPipeline,
         }
         self.controlnet_attn_proc = utils.CrossFrameAttnProcessor(
             unet_chunk_size=2)
@@ -165,6 +169,73 @@ class Model:
             video_path, resolution, self.device, self.dtype, False)
         control = utils.pre_process_canny(
             video, low_threshold, high_threshold).to(self.device).to(self.dtype)
+
+        # canny_to_save = list(rearrange(control, 'f c w h -> f w h c').cpu().detach().numpy())
+        # _ = utils.create_video(canny_to_save, 4, path="ddxk.mp4", watermark=None)
+
+        f, _, h, w = video.shape
+        self.generator.manual_seed(seed)
+        latents = torch.randn((1, 4, h//8, w//8), dtype=self.dtype,
+                              device=self.device, generator=self.generator)
+        latents = latents.repeat(f, 1, 1, 1)
+        result = self.inference(image=control,
+                                prompt=prompt + ', ' + added_prompt,
+                                height=h,
+                                width=w,
+                                negative_prompt=negative_prompts,
+                                num_inference_steps=num_inference_steps,
+                                guidance_scale=guidance_scale,
+                                controlnet_conditioning_scale=controlnet_conditioning_scale,
+                                eta=eta,
+                                latents=latents,
+                                seed=seed,
+                                output_type='numpy',
+                                split_to_chunks=True,
+                                chunk_size=chunk_size,
+                                merging_ratio=merging_ratio,
+                                )
+        return utils.create_video(result, fps, path=save_path, watermark=gradio_utils.logo_name_to_path(watermark))
+
+    def process_controlnet_depth(self,
+                                 video_path,
+                                 prompt,
+                                 chunk_size=8,
+                                 watermark='Picsart AI Research',
+                                 merging_ratio=0.0,
+                                 num_inference_steps=20,
+                                 controlnet_conditioning_scale=1.0,
+                                 guidance_scale=9.0,
+                                 seed=42,
+                                 eta=0.0,
+                                 resolution=512,
+                                 use_cf_attn=True,
+                                 save_path=None):
+        print("Module Depth")
+        video_path = gradio_utils.edge_path_to_video_path(video_path)
+        if self.model_type != ModelType.ControlNetDepth:
+            controlnet = ControlNetModel.from_pretrained(
+                "lllyasviel/sd-controlnet-depth")
+            self.set_model(ModelType.ControlNetDepth,
+                           model_id="runwayml/stable-diffusion-v1-5", controlnet=controlnet)
+            self.pipe.scheduler = DDIMScheduler.from_config(
+                self.pipe.scheduler.config)
+            if use_cf_attn:
+                self.pipe.unet.set_attn_processor(
+                    processor=self.controlnet_attn_proc)
+                self.pipe.controlnet.set_attn_processor(
+                    processor=self.controlnet_attn_proc)
+
+        # added_prompt = 'best quality, extremely detailed'
+        # negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+
+        video, fps = utils.prepare_video(
+            video_path, resolution, self.device, self.dtype, False)
+        control = utils.pre_process_depth(
+            video).to(self.device).to(self.dtype)
+
+        depth_map_to_save = list(rearrange(control, 'f c w h -> f w h c').cpu().detach().numpy())
+        _ = utils.create_video(depth_map_to_save, 4, path="ddxk.mp4", watermark=None)
+
         f, _, h, w = video.shape
         self.generator.manual_seed(seed)
         latents = torch.randn((1, 4, h//8, w//8), dtype=self.dtype,
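process_controlnet_depth lazily switches the shared pipeline to a depth ControlNet the first time it runs, then samples a single latent of shape (1, 4, h//8, w//8) and repeats it across all f frames, so every frame starts from the same initial noise. A sketch of the equivalent pipeline setup written with plain diffusers calls for clarity; set_model() is assumed to do roughly this internally, and this is not the repository's actual loading code:

import torch
from diffusers import ControlNetModel, DDIMScheduler, StableDiffusionControlNetPipeline

# Depth-conditioned ControlNet on top of SD 1.5, as selected by the new branch.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
# Same scheduler swap the method performs after set_model().
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")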
utils.py CHANGED
@@ -12,11 +12,12 @@ from PIL import Image
 from annotator.util import resize_image, HWC3
 from annotator.canny import CannyDetector
 from annotator.openpose import OpenposeDetector
+from annotator.midas import MidasDetector
 import decord
-# decord.bridge.set_bridge('torch')
 
 apply_canny = CannyDetector()
 apply_openpose = OpenposeDetector()
+apply_midas = MidasDetector()
 
 
 def add_watermark(image, watermark_path, wm_rel_size=1/16, boundary=5):
@@ -55,6 +56,24 @@ def pre_process_canny(input_video, low_threshold=100, high_threshold=200):
     return rearrange(control, 'f h w c -> f c h w')
 
 
+def pre_process_depth(input_video, apply_depth_detect: bool = True):
+    detected_maps = []
+    for frame in input_video:
+        img = rearrange(frame, 'c h w -> h w c').cpu().numpy().astype(np.uint8)
+        img = HWC3(img)
+        if apply_depth_detect:
+            detected_map, _ = apply_midas(img)
+        else:
+            detected_map = img
+        detected_map = HWC3(detected_map)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+        detected_maps.append(detected_map[None])
+    detected_maps = np.concatenate(detected_maps)
+    control = torch.from_numpy(detected_maps.copy()).float() / 255.0
+    return rearrange(control, 'f h w c -> f c h w')
+
+
 def pre_process_pose(input_video, apply_pose_detect: bool = True):
     detected_maps = []
     for frame in input_video:
@@ -137,14 +156,15 @@ def prepare_video(video_path:str, resolution:int, device, dtype, normalize=True,
     _, h, w, _ = video.shape
     video = rearrange(video, "f h w c -> f c h w")
     video = torch.Tensor(video).to(device).to(dtype)
-    if h > w:
-        w = int(w * resolution / h)
-        w = w - w % 8
-        h = resolution - resolution % 8
-    else:
-        h = int(h * resolution / w)
-        h = h - h % 8
-        w = resolution - resolution % 8
+
+    # Use max if you want the larger side to be equal to resolution (e.g. 512)
+    # k = float(resolution) / min(h, w)
+    k = float(resolution) / max(h, w)
+    h *= k
+    w *= k
+    h = int(np.round(h / 64.0)) * 64
+    w = int(np.round(w / 64.0)) * 64
+
     video = Resize((h, w), interpolation=InterpolationMode.BILINEAR, antialias=True)(video)
     if normalize:
         video = video / 127.5 - 1.0
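pre_process_depth mirrors pre_process_canny: it runs the MiDaS detector on each frame and returns a float tensor of shape (f, 3, h, w) scaled to [0, 1]. The prepare_video change replaces the old snap-to-multiple-of-8 sizing with a rule that scales the larger side to `resolution` and snaps both sides to multiples of 64. A standalone restatement of that arithmetic with illustrative input sizes (not code from the commit):

import numpy as np

def target_size(h, w, resolution=512):
    # New rule: scale so the larger side equals `resolution`,
    # then round both sides to the nearest multiple of 64.
    k = float(resolution) / max(h, w)
    h, w = h * k, w * k
    return int(np.round(h / 64.0)) * 64, int(np.round(w / 64.0)) * 64

print(target_size(480, 640))  # (384, 512)
print(target_size(640, 480))  # (512, 384)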