solo-depth-any-video

Sleeping

App Files Files Community

depthanyvideo committed on Oct 19

Commit

0297809

•

1 Parent(s): 47ac829

update

Browse files

Files changed (2) hide show

app.py +176 -143
dav/utils/img_utils.py +27 -20

app.py CHANGED Viewed

@@ -1,10 +1,11 @@
-import gradio as gr
-import logging
 import os
 import random
 import tempfile
 import time
-import spaces
 from easydict import EasyDict
 import numpy as np
 import torch
@@ -24,11 +25,11 @@ def seed_all(seed: int = 0):
     torch.cuda.manual_seed_all(seed)
-# Initialize logging
-logging.basicConfig(level=logging.INFO)
-# Load models once to avoid reloading on every inference
 def load_models(model_base, device):
     vae = AutoencoderKLTemporalDecoder.from_pretrained(model_base, subfolder="vae")
     scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
@@ -50,146 +51,178 @@ def load_models(model_base, device):
     return pipe
-# Load models at startup
-MODEL_BASE = "hhyangcs/depth-any-video"
-DEVICE_TYPE = "cuda"
-DEVICE = torch.device(DEVICE_TYPE)
-pipe = load_models(MODEL_BASE, DEVICE)
 @spaces.GPU(duration=140)
-def depth_any_video(
-    file,
-    denoise_steps=3,
-    num_frames=32,
-    decode_chunk_size=16,
-    num_interp_frames=16,
-    num_overlap_frames=6,
-    max_resolution=1024,
 ):
-    """
-    Perform depth estimation on the uploaded video/image.
-    """
-    with open(file, "rb") as _file:
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            # Save the uploaded file
-            input_path = os.path.join(tmp_dir, file.name)
-            with open(input_path, "wb") as f:
-                f.write(_file.read())
-            # Set up output directory
-            output_dir = os.path.join(tmp_dir, "output")
-            os.makedirs(output_dir, exist_ok=True)
-            # Prepare configuration
-            cfg = EasyDict(
-                {
-                    "model_base": MODEL_BASE,
-                    "data_path": input_path,
-                    "output_dir": output_dir,
-                    "denoise_steps": denoise_steps,
-                    "num_frames": num_frames,
-                    "decode_chunk_size": decode_chunk_size,
-                    "num_interp_frames": num_interp_frames,
-                    "num_overlap_frames": num_overlap_frames,
-                    "max_resolution": max_resolution,
-                    "seed": 666,
-                }
-            )
-            seed_all(cfg.seed)
-            file_name = os.path.splitext(os.path.basename(cfg.data_path))[0]
-            is_video = cfg.data_path.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))
-            if is_video:
-                num_interp_frames = cfg.num_interp_frames
-                num_overlap_frames = cfg.num_overlap_frames
-                num_frames = cfg.num_frames
-                assert num_frames % 2 == 0, "num_frames should be even."
-                assert (
-                    2 <= num_overlap_frames <= (num_interp_frames + 2 + 1) // 2
-                ), "Invalid frame overlap."
-                max_frames = (num_interp_frames + 2 - num_overlap_frames) * (
-                    num_frames // 2
-                )
-                image, fps = img_utils.read_video(cfg.data_path, max_frames=max_frames)
-            else:
-                image = img_utils.read_image(cfg.data_path)
-            image = img_utils.imresize_max(image, cfg.max_resolution)
-            image = img_utils.imcrop_multi(image)
-            image_tensor = np.ascontiguousarray(
-                [_img.transpose(2, 0, 1) / 255.0 for _img in image]
-            )
-            image_tensor = torch.from_numpy(image_tensor).to(DEVICE)
-            with torch.no_grad(), torch.autocast(
-                device_type=DEVICE_TYPE, dtype=torch.float16
-            ):
-                pipe_out = pipe(
-                    image_tensor,
-                    num_frames=cfg.num_frames,
-                    num_overlap_frames=cfg.num_overlap_frames,
-                    num_interp_frames=cfg.num_interp_frames,
-                    decode_chunk_size=cfg.decode_chunk_size,
-                    num_inference_steps=cfg.denoise_steps,
-                )
-            disparity = pipe_out.disparity
-            disparity_colored = pipe_out.disparity_colored
-            image = pipe_out.image
-            # (N, H, 2 * W, 3)
-            merged = np.concatenate(
-                [
-                    image,
-                    disparity_colored,
-                ],
-                axis=2,
-            )
-            if is_video:
-                output_path = os.path.join(cfg.output_dir, f"{file_name}_depth.mp4")
-                img_utils.write_video(
-                    output_path,
-                    merged,
-                    fps,
-                )
-                return output_path
-            else:
-                output_path = os.path.join(cfg.output_dir, f"{file_name}_depth.png")
-                img_utils.write_image(
-                    output_path,
-                    merged[0],
-                )
-                return output_path
-# Define Gradio interface
-title = "Depth Any Video with Scalable Synthetic Data"
-description = """
-Upload a video or image to perform depth estimation using the Depth Any Video model.
-Adjust the parameters as needed to control the inference process.
-"""
-iface = gr.Interface(
-    fn=depth_any_video,
-    inputs=[
-        gr.File(label="Upload Video/Image"),
-        gr.Slider(1, 10, step=1, value=3, label="Denoise Steps"),
-        gr.Slider(16, 64, step=1, value=32, label="Number of Frames"),
-        gr.Slider(8, 32, step=1, value=16, label="Decode Chunk Size"),
-        gr.Slider(8, 32, step=1, value=16, label="Number of Interpolation Frames"),
-        gr.Slider(2, 10, step=1, value=6, label="Number of Overlap Frames"),
-        gr.Slider(512, 2048, step=32, value=1024, label="Maximum Resolution"),
-    ],
-    outputs=gr.Video(label="Depth Enhanced Video/Image"),
-    title=title,
-    description=description,
-    examples=[["demos/arch_2.jpg"], ["demos/wooly_mammoth.mp4"]],
-    allow_flagging="never",
-    analytics_enabled=False,
-)
 if __name__ == "__main__":
-    iface.launch(share=True)

+import gc
 import os
+import spaces
+import gradio as gr
 import random
 import tempfile
 import time
 from easydict import EasyDict
 import numpy as np
 import torch
     torch.cuda.manual_seed_all(seed)
+examples = [
+    ["demos/wooly_mammoth.mp4", 3, 32, 16, 16, 6, 960],
+]
 def load_models(model_base, device):
     vae = AutoencoderKLTemporalDecoder.from_pretrained(model_base, subfolder="vae")
     scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
     return pipe
+# Model identifier handed to load_models, which forwards it to the
+# from_pretrained loaders.
+model_base = "hhyangcs/depth-any-video"
+# Kept as a plain string as well because torch.autocast in infer_depth takes
+# the device type by name.
+device_type = "cuda"
+device = torch.device(device_type)
+# Build the pipeline once at import time so every infer_depth call reuses it.
+pipe = load_models(model_base, device)
 @spaces.GPU(duration=140)
+def infer_depth(
+    file: str,
+    denoise_steps: int = 3,
+    num_frames: int = 32,
+    decode_chunk_size: int = 16,
+    num_interp_frames: int = 16,
+    num_overlap_frames: int = 6,
+    max_resolution: int = 1024,
+    seed: int = 66,
+    output_dir: str = "./outputs",
 ):
+    seed_all(seed)
+    max_frames = (num_interp_frames + 2 - num_overlap_frames) * (num_frames // 2)
+    image, fps = img_utils.read_video(file, max_frames=max_frames)
+    image = img_utils.imresize_max(image, max_resolution)
+    image = img_utils.imcrop_multi(image)
+    image_tensor = np.ascontiguousarray(
+        [_img.transpose(2, 0, 1) / 255.0 for _img in image]
+    )
+    image_tensor = torch.from_numpy(image_tensor).to(device)
+    print(f"==> video name: {file}, frames shape: {image_tensor.shape}")
+    with torch.no_grad(), torch.autocast(device_type=device_type, dtype=torch.float16):
+        pipe_out = pipe(
+            image_tensor,
+            num_frames=num_frames,
+            num_overlap_frames=num_overlap_frames,
+            num_interp_frames=num_interp_frames,
+            decode_chunk_size=decode_chunk_size,
+            num_inference_steps=denoise_steps,
+        )
+    disparity = pipe_out.disparity
+    disparity_colored = pipe_out.disparity_colored
+    image = pipe_out.image
+    # (N, H, 2 * W, 3)
+    merged = np.concatenate(
+        [
+            image,
+            disparity_colored,
+        ],
+        axis=2,
+    )
+    file_name = os.path.splitext(os.path.basename(file))[0]
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, f"{file_name}_depth.mp4")
+    img_utils.write_video(
+        output_path,
+        merged,
+        fps,
+    )
+    # clear the cache for the next video
+    gc.collect()
+    torch.cuda.empty_cache()
+    return output_path
+def construct_demo():
+    with gr.Blocks(analytics_enabled=False) as depthanyvideo_iface:
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                input_video = gr.Video(label="Input Video")
+            with gr.Column(scale=1):
+                with gr.Row(equal_height=True):
+                    output_video = gr.Video(
+                        label="Ouput Video Depth",
+                        interactive=False,
+                        autoplay=True,
+                        loop=True,
+                        show_share_button=True,
+                        scale=1,
+                    )
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                with gr.Row(equal_height=False):
+                    with gr.Accordion("Advanced Settings", open=False):
+                        denoise_steps = gr.Slider(
+                            label="Denoise Steps",
+                            minimum=1,
+                            maximum=10,
+                            value=3,
+                            step=1,
+                        )
+                        num_frames = gr.Slider(
+                            label="Number of Key Frames",
+                            minimum=16,
+                            maximum=32,
+                            value=24,
+                            step=2,
+                        )
+                        decode_chunk_size = gr.Slider(
+                            label="Decode Chunk Size",
+                            minimum=8,
+                            maximum=32,
+                            value=16,
+                            step=1,
+                        )
+                        num_interp_frames = gr.Slider(
+                            label="Number of Interpolation Frames",
+                            minimum=8,
+                            maximum=32,
+                            value=16,
+                            step=1,
+                        )
+                        num_overlap_frames = gr.Slider(
+                            label="Number of Overlap Frames",
+                            minimum=2,
+                            maximum=10,
+                            value=6,
+                            step=1,
+                        )
+                        max_resolution = gr.Slider(
+                            label="Maximum Resolution",
+                            minimum=512,
+                            maximum=2048,
+                            value=1024,
+                            step=32,
+                        )
+                    generate_btn = gr.Button("Generate")
+            with gr.Column(scale=2):
+                pass
+        gr.Examples(
+            examples=examples,
+            inputs=[
+                input_video,
+                denoise_steps,
+                num_frames,
+                decode_chunk_size,
+                num_interp_frames,
+                num_overlap_frames,
+                max_resolution,
+            ],
+            outputs=output_video,
+            fn=infer_depth,
+            cache_examples="lazy",
+        )
+        generate_btn.click(
+            fn=infer_depth,
+            inputs=[
+                input_video,
+                denoise_steps,
+                num_frames,
+                decode_chunk_size,
+                num_interp_frames,
+                num_overlap_frames,
+                max_resolution,
+            ],
+            outputs=output_video,
+        )
+    return depthanyvideo_iface
+demo = construct_demo()
 if __name__ == "__main__":
+    demo.queue()
+    demo.launch(share=True)

dav/utils/img_utils.py CHANGED Viewed

@@ -85,26 +85,33 @@ def read_image(image_path):
 def write_video(video_path, frames, fps):
-    tmp_dir = os.path.join(os.path.dirname(video_path), "tmp")
-    os.makedirs(tmp_dir, exist_ok=True)
-    for i, frame in enumerate(frames):
-        write_image(os.path.join(tmp_dir, f"{i:06d}.png"), frame)
-    # it will cause visual compression artifacts
-    ffmpeg_command = [
-        "ffmpeg",
-        "-f",
-        "image2",
-        "-framerate",
-        f"{fps}",
-        "-i",
-        os.path.join(tmp_dir, "%06d.png"),
-        "-b:v",
-        "5626k",
-        "-y",
-        video_path,
-    ]
-    os.system(" ".join(ffmpeg_command))
-    os.system(f"rm -rf {tmp_dir}")
 def write_image(image_path, frame):

 def write_video(video_path, frames, fps):
+    # tmp_dir = os.path.join(os.path.dirname(video_path), "tmp")
+    # os.makedirs(tmp_dir, exist_ok=True)
+    # for i, frame in enumerate(frames):
+    #     write_image(os.path.join(tmp_dir, f"{i:06d}.png"), frame)
+    # # it will cause visual compression artifacts
+    # ffmpeg_command = [
+    #     "ffmpeg",
+    #     "-f",
+    #     "image2",
+    #     "-framerate",
+    #     f"{fps}",
+    #     "-i",
+    #     os.path.join(tmp_dir, "%06d.png"),
+    #     "-b:v",
+    #     "5626k",
+    #     "-y",
+    #     video_path,
+    # ]
+    # os.system(" ".join(ffmpeg_command))
+    # os.system(f"rm -rf {tmp_dir}")
+    h, w = frames[0].shape[:2]
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    out = cv2.VideoWriter(video_path, fourcc, fps, (w, h))
+    for frame in frames:
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+        out.write(frame)
+    out.release()
 def write_image(image_path, frame):