# Spaces: heheyas / V3D
# Runtime error
# V3D
# File size: 8,505 Bytes

# TODO
import numpy as np
import argparse
import torch
from torchvision.utils import make_grid
import tempfile
import gradio as gr
from omegaconf import OmegaConf
from einops import rearrange
from scripts.pub.V3D_512 import (
    sample_one,
    get_batch,
    get_unique_embedder_keys_from_conditioner,
    load_model,
)
from sgm.util import default, instantiate_from_config
from safetensors.torch import load_file as load_safetensors
from PIL import Image
from kiui.op import recenter
from torchvision.transforms import ToTensor
from einops import rearrange, repeat
import rembg
import os
from glob import glob
from mediapy import write_video
from pathlib import Path
import spaces
from huggingface_hub import hf_hub_download
import imageio

import cv2


def _prepare_input_image(image, border_ratio, ignore_alpha):
    """Turn the raw input array into a 512x512 RGB PIL image.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``.
        border_ratio: When > 0, the foreground is carved out and re-centred
            with this border ratio; otherwise the image is used as is.
        ignore_alpha: When true, an existing alpha channel is discarded and
            the foreground is re-estimated with ``rembg``.

    Returns:
        A 512x512 ``PIL.Image`` in RGB mode.

    Raises:
        gr.Error: If no foreground pixel is found in the carved image.
    """
    image = Image.fromarray(image)

    if border_ratio > 0:
        if image.mode != "RGBA" or ignore_alpha:
            # No usable alpha channel: let rembg estimate one.
            rgb = np.asarray(image.convert("RGB"))
            carved_image = rembg.remove(rgb, session=rembg_session)  # [H, W, 4]
        else:
            carved_image = np.asarray(image)

        mask = carved_image[..., -1] > 0
        if not mask.any():
            # NOTE(review): recenter presumably needs at least one foreground
            # pixel to compute a bounding box; fail early with a clear message.
            raise gr.Error("No foreground object could be found in the input image.")

        recentered = recenter(carved_image, mask, border_ratio=border_ratio)
        recentered = recentered.astype(np.float32) / 255.0
        if recentered.shape[-1] == 4:
            # Composite onto a white background using the alpha channel.
            alpha = recentered[..., 3:4]
            recentered = recentered[..., :3] * alpha + (1 - alpha)
        image = Image.fromarray((recentered * 255).astype(np.uint8))
    else:
        print("Ignore border ratio")
        # The model input must have exactly three channels, so flatten any
        # alpha channel onto white and expand grayscale/palette images.
        if image.mode == "RGBA":
            background = Image.new("RGB", image.size, (255, 255, 255))
            background.paste(image, mask=image.getchannel("A"))
            image = background
        elif image.mode != "RGB":
            image = image.convert("RGB")

    return image.resize((512, 512))


@spaces.GPU
def do_sample(
    image,
    num_frames,
    num_steps,
    decoding_t,
    border_ratio,
    ignore_alpha,
    output_folder,
    seed,
):
    """Sample a video from a single input image and write it to disk.

    Args:
        image: Input image as an array (``None`` is rejected).
        num_frames: Number of frames to sample.
        num_steps: Not read by this function; kept so existing callers keep
            working (the step count is passed to ``load_model`` at module
            level).
        decoding_t: Number of frames decoded at a time by the first stage.
        border_ratio: Border ratio for re-centring; values <= 0 skip the
            background removal / re-centring step.
        ignore_alpha: Discard any alpha channel of the input and re-estimate
            the foreground.
        output_folder: Directory the ``.mp4`` file is written to (created if
            missing).
        seed: Seed passed to ``torch.manual_seed``.

    Returns:
        Path of the written ``.mp4`` file.

    Raises:
        gr.Error: If no image was provided or no foreground was found.
    """
    if image is None:
        raise gr.Error("Please provide an input image.")

    torch.manual_seed(seed)
    image = _prepare_input_image(image, border_ratio, ignore_alpha)

    image = ToTensor()(image)
    image = image * 2.0 - 1.0  # [0, 1] -> [-1, 1]
    image = image.unsqueeze(0).to(device)
    H, W = image.shape[2:]
    assert image.shape[1] == 3

    F = 8  # spatial downsampling factor used for the latent shape
    C = 4  # number of latent channels
    shape = (num_frames, C, H // F, W // F)
    cond_aug = 0.05

    # Everything below is inference only, so the encoders also run without
    # autograd bookkeeping (they stay outside autocast, as before).
    with torch.no_grad():
        cond_frames_without_noise = clip_model(image)
        cond_frames = ae_model.encode(image)
        cond_frames = cond_frames + cond_aug * torch.randn_like(cond_frames)

        value_dict = {
            "motion_bucket_id": 0,
            "fps_id": 0,
            "cond_aug": cond_aug,
            "cond_frames_without_noise": cond_frames_without_noise,
            "cond_frames": cond_frames,
        }

        print(device)
        # NOTE(review): device_type is fixed to "cuda"; on a CPU-only host
        # autocast is expected to be a no-op — confirm if CPU runs matter.
        with torch.autocast(device_type="cuda"):
            batch, batch_uc = get_batch(
                get_unique_embedder_keys_from_conditioner(model.conditioner),
                value_dict,
                [1, num_frames],
                T=num_frames,
                device=device,
            )
            c, uc = model.conditioner.get_unconditional_conditioning(
                batch,
                batch_uc=batch_uc,
                force_uc_zero_embeddings=[
                    "cond_frames",
                    "cond_frames_without_noise",
                ],
            )

            # Tile the per-sample conditioning across frames and fold the
            # frame axis into the batch axis.
            for k in ["crossattn", "concat"]:
                uc[k] = repeat(uc[k], "b ... -> (b t) ...", t=num_frames)
                c[k] = repeat(c[k], "b ... -> (b t) ...", t=num_frames)

            randn = torch.randn(shape, device=device)

            additional_model_inputs = {
                "image_only_indicator": torch.zeros(2, num_frames, device=device),
                "num_video_frames": batch["num_video_frames"],
            }

            def denoiser(input, sigma, c):
                return model.denoiser(
                    model.model, input, sigma, c, **additional_model_inputs
                )

            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
            model.en_and_decode_n_samples_a_time = decoding_t
            samples_x = model.decode_first_stage(samples_z)
            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)

    # Name the output after the number of videos already in the folder.
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    frames = (
        (rearrange(samples, "t c h w -> t h w c") * 255)
        .cpu()
        .numpy()
        .astype(np.uint8)
    )
    imageio.mimwrite(video_path, frames, fps=6)

    return video_path


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the V3D checkpoint and the SVD-XT weights from the Hugging Face Hub.
V3D_ckpt_path = hf_hub_download(repo_id="heheyas/V3D", filename="V3D.ckpt")
svd_xt_ckpt_path = hf_hub_download(
    repo_id="stabilityai/stable-video-diffusion-img2vid-xt",
    filename="svd_xt.safetensors",
)

model_config = "./scripts/pub/configs/V3D_512.yaml"
# The frame count is read from the guider section of the model config.
num_frames = OmegaConf.load(
    model_config
).model.params.sampler_config.params.guider_config.params.num_frames
print("Detected num_frames:", num_frames)
num_steps = 25
output_folder = "outputs/V3D_512"


def _load_submodule(config_path, state_dict, key_marker, key_prefix):
    """Build a module from ``config_path`` and load its weights from ``state_dict``.

    Only entries whose key contains ``key_marker`` are used; ``key_prefix`` is
    removed from those keys before loading. The module is returned in eval
    mode on ``device``.
    """
    module = instantiate_from_config(OmegaConf.load(config_path)).eval()
    weights = {
        key.replace(key_prefix, ""): value
        for key, value in state_dict.items()
        if key_marker in key
    }
    module.load_state_dict(weights)
    return module.to(device)


# The image embedder and the autoencoder are both initialised from the
# SVD-XT checkpoint.
sd = load_safetensors(svd_xt_ckpt_path)
clip_model = _load_submodule(
    "./configs/embedder/clip_image.yaml",
    sd,
    "conditioner.embedders.0",
    "conditioner.embedders.0.",
)
ae_model = _load_submodule(
    "./configs/ae/video.yaml",
    sd,
    "first_stage_model",
    "first_stage_model.",
)
# load_state_dict copies the tensors into the modules, so the full checkpoint
# dict is no longer needed; drop it instead of holding it for the whole
# lifetime of the process.
del sd

rembg_session = rembg.new_session()

model, _ = load_model(
    model_config,
    device,
    num_frames,
    num_steps,
    min_cfg=3.5,
    max_cfg=3.5,
    ckpt_path=V3D_ckpt_path,
)
model = model.to(device)

# Gradio UI: input image and sampling controls on the left, the generated
# video on the right.
with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
    with gr.Row(equal_height=True):
        with gr.Column():
            input_image = gr.Image(value=None, label="Input Image")

            border_ratio_slider = gr.Slider(
                value=0.3,
                label="Border Ratio",
                minimum=0.05,
                maximum=0.5,
                step=0.05,
            )
            # precision=0 makes the component hand back an integer seed.
            seed_input = gr.Number(value=42, label="Seed", precision=0)
            decoding_t_slider = gr.Slider(
                value=1,
                label="Number of Decoding frames",
                minimum=1,
                maximum=num_frames,
                step=1,
            )
            min_guidance_slider = gr.Slider(
                value=3.5,
                label="Min CFG Value",
                minimum=0.05,
                maximum=5,
                step=0.05,
            )
            max_guidance_slider = gr.Slider(
                value=3.5,
                label="Max CFG Value",
                minimum=0.05,
                maximum=5,
                step=0.05,
            )
            run_button = gr.Button(value="Run V3D")

        with gr.Column():
            output_video = gr.Video(value=None, label="Output Orbit Video")

    @run_button.click(
        inputs=[
            input_image,
            border_ratio_slider,
            min_guidance_slider,
            max_guidance_slider,
            decoding_t_slider,
            seed_input,
        ],
        outputs=[output_video],
    )
    def _(image, border_ratio, min_guidance, max_guidance, decoding_t, seed):
        """Validate the inputs, apply the guidance range and run sampling."""
        # Reject incomplete input here, before the GPU-decorated sampler runs.
        if image is None:
            raise gr.Error("Please upload an input image before running V3D.")
        if seed is None:
            raise gr.Error("Please provide a seed.")

        # NOTE(review): this mutates the shared global model; confirm the
        # guider reads min_scale/max_scale at sampling time.
        model.sampler.guider.max_scale = max_guidance
        model.sampler.guider.min_scale = min_guidance
        return do_sample(
            image,
            num_frames,
            num_steps,
            int(decoding_t),
            border_ratio,
            False,
            output_folder,
            int(seed),
        )


demo.launch()