NaRCan_demo

Sleeping

File size: 13,630 Bytes

import gradio as gr
import spaces
import numpy as np
import torch
torch.jit.script = lambda f: f
import cv2
import os
import imageio
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from controlnet_aux import LineartDetector
from functools import partial
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, ToTensor, Normalize, Resize

from NaRCan_model import Homography, Siren
from util import get_mgrid, apply_homography, jacobian, VideoFitting, TestVideoFitting


device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_example():
    case = [
        [
            'examples/bear.mp4',     
        ],
        [
            'examples/boat.mp4',     
        ],
        [
            'examples/woman-drink.mp4',     
        ],
        [
            'examples/corgi.mp4',     
        ],
        [
            'examples/yacht.mp4',     
        ],
        [
            'examples/koolshooters.mp4',     
        ],
        [
            'examples/overlook-the-ocean.mp4',     
        ],
        [
            'examples/rotate.mp4',
        ],
        [
            'examples/shark-ocean.mp4',     
        ],
        [
            'examples/surf.mp4',     
        ],
        [
            'examples/cactus.mp4',     
        ],
        [
            'examples/gold-fish.mp4',
        ]
    ]
    return case


def set_default_prompt(video_name):
    video_to_prompt = {
        'bear.mp4': 'bear, Van Gogh Style',
        'boat.mp4': 'a burning boat sails on lava',
        'cactus.mp4': 'cactus, made of paper',
        'corgi.mp4': 'a hellhound',
        'gold-fish.mp4': 'Goldfish in the Milky Way',
        'koolshooters.mp4': 'Avatar',
        'overlook-the-ocean.mp4': 'ocean, pixel style',
        'rotate.mp4': 'turbine engine',
        'shark-ocean.mp4': 'A sleek shark, cartoon style',
        'surf.mp4': 'Sailing, The background is a large white cloud, sketch style',
        'woman-drink.mp4': 'a drinking zombie',
        'yacht.mp4': 'yacht, cyberpunk style',
    }
    return video_to_prompt.get(video_name, '')


def update_prompt(input_video):
    video_name = input_video.split('/')[-1]
    return set_default_prompt(video_name)


# Map videos to corresponding images
video_to_image = {
    'bear.mp4': ['canonical/bear.png', 'pth_file/bear', 'examples_frames/bear'],
    'boat.mp4': ['canonical/boat.png', 'pth_file/boat', 'examples_frames/boat'],
    'cactus.mp4': ['canonical/cactus.png', 'pth_file/cactus', 'examples_frames/cactus'],
    'corgi.mp4': ['canonical/corgi.png', 'pth_file/corgi', 'examples_frames/corgi'],
    'gold-fish.mp4': ['canonical/gold-fish.png', 'pth_file/gold-fish', 'examples_frames/gold-fish'],
    'koolshooters.mp4': ['canonical/koolshooters.png', 'pth_file/koolshooters', 'examples_frames/koolshooters'],
    'overlook-the-ocean.mp4': ['canonical/overlook-the-ocean.png', 'pth_file/overlook-the-ocean', 'examples_frames/overlook-the-ocean'],
    'rotate.mp4': ['canonical/rotate.png', 'pth_file/rotate', 'examples_frames/rotate'],
    'shark-ocean.mp4': ['canonical/shark-ocean.png', 'pth_file/shark-ocean', 'examples_frames/shark-ocean'],
    'surf.mp4': ['canonical/surf.png', 'pth_file/surf', 'examples_frames/surf'],
    'woman-drink.mp4': ['canonical/woman-drink.png', 'pth_file/woman-drink', 'examples_frames/woman-drink'],
    'yacht.mp4': ['canonical/yacht.png', 'pth_file/yacht', 'examples_frames/yacht'],
}


def images_to_video(image_list, output_path, fps=10):
    # Convert PIL Images to numpy arrays
    frames = [np.array(img).astype(np.uint8) for img in image_list]
    frames = frames[:20]

    # Create video writer
    writer = imageio.get_writer(output_path, fps=fps, codec='libx264')

    for frame in frames:
        writer.append_data(frame)

    writer.close()


@spaces.GPU(duration=120)
def NaRCan_make_video(edit_canonical, pth_path, frames_path):
    # load NaRCan model
    checkpoint_g_old = torch.load(os.path.join(pth_path, "homography_g.pth"))
    checkpoint_g = torch.load(os.path.join(pth_path, "mlp_g.pth"))
    g_old = Homography(hidden_features=256, hidden_layers=2).to(device)
    g = Siren(in_features=3, out_features=2, hidden_features=256,
              hidden_layers=5, outermost_linear=True).to(device)
    
    g_old.load_state_dict(checkpoint_g_old)
    g.load_state_dict(checkpoint_g)

    g_old.eval()
    g.eval()

    transform = Compose([
        Resize(512),
        ToTensor(),
        Normalize(torch.Tensor([0.5, 0.5, 0.5]), torch.Tensor([0.5, 0.5, 0.5]))
    ])
    v = TestVideoFitting(frames_path, transform)
    videoloader = DataLoader(v, batch_size=1, pin_memory=True, num_workers=0)

    model_input, ground_truth = next(iter(videoloader))
    model_input, ground_truth = model_input[0].to(device), ground_truth[0].to(device)

    myoutput = None
    data_len = len(os.listdir(frames_path))

    with torch.no_grad():
        batch_size = (v.H * v.W)
        for step in range(data_len):
            start = (step * batch_size) % len(model_input)
            end = min(start + batch_size, len(model_input))

            # get the deformation
            xy, t = model_input[start:end, :-1], model_input[start:end, [-1]]
            xyt = model_input[start:end]
            h_old = apply_homography(xy, g_old(t))
            h = g(xyt)
            xy_ = h_old + h

            # use canonical to reconstruct
            w, h = v.W, v.H
            canonical_img = np.array(edit_canonical.convert('RGB'))
            canonical_img = torch.from_numpy(canonical_img).float().to(device)
            h_c, w_c = canonical_img.shape[:2]
            grid_new = xy_.clone()
            grid_new[..., 1] = xy_[..., 0] / 1.5
            grid_new[..., 0] = xy_[..., 1] / 2.0

            if len(canonical_img.shape) == 3:
                canonical_img = canonical_img.unsqueeze(0)
            results = torch.nn.functional.grid_sample(
                canonical_img.permute(0, 3, 1, 2),
                grid_new.unsqueeze(1).unsqueeze(0),
                mode='bilinear',
                padding_mode='border')
            o = results.squeeze().permute(1,0)

            if step == 0:
                myoutput = o
            
            else:
                myoutput = torch.cat([myoutput, o])

    myoutput = myoutput.reshape(512, 512, data_len, 3).permute(2, 0, 1, 3).clone().detach().cpu().numpy().astype(np.float32)
    # myoutput = np.clip(myoutput, -1, 1) * 0.5 + 0.5

    for i in range(len(myoutput)):
        myoutput[i] = Image.fromarray(np.uint8(myoutput[i])).resize((512, 512)) #854, 480

    edit_video_path = f'NaRCan_fps_10.mp4'
    images_to_video(myoutput, edit_video_path)
    
    return edit_video_path


@spaces.GPU(duration=120)
def edit_with_pnp(input_video, prompt, num_steps, guidance_scale, seed, n_prompt, control_type="Lineart"):
    video_name = input_video.split('/')[-1]
    if video_name in video_to_image:
        image_path = video_to_image[video_name][0]
        pth_path = video_to_image[video_name][1]
        frames_path = video_to_image[video_name][2]
    else:
        return None

    if control_type == "Lineart":
        # Load the control net model for lineart
        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_lineart", torch_dtype=torch.float16)
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
        )
        pipe.to(device)
        # lineart
        processor = LineartDetector.from_pretrained("lllyasviel/Annotators")
        processor_partial = partial(processor, coarse=False)
        size_ = 768
        canonical_image = Image.open(image_path)
        ori_size = canonical_image.size
        image = processor_partial(canonical_image.resize((size_, size_)), detect_resolution=size_, image_resolution=size_)
        image = image.resize(ori_size, resample=Image.BILINEAR)
        image.save("control_map.png")
        
        generator = torch.manual_seed(seed) if seed != -1 else None
        output_images = pipe(
            prompt=prompt,
            image=image,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            negative_prompt=n_prompt,
            generator=generator
        ).images

        output_images[0].save("edited_canonical_image.png")
        # output_images[0] = output_images[0].resize(ori_size, resample=Image.BILINEAR)
    
    else:
        # Load the control net model for canny
        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16)
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
        )
        pipe.to(device)
        # canny
        canonical_image = cv2.imread(image_path)
        canonical_image = cv2.cvtColor(canonical_image, cv2.COLOR_BGR2RGB)
        image = cv2.cvtColor(canonical_image, cv2.COLOR_RGB2GRAY)
        image = cv2.Canny(image, 100, 200)
        image = image[:, :, None]
        image = np.concatenate([image, image, image], axis=2)
        image = Image.fromarray(image)
        image.save("control_map.png")
        
        generator = torch.manual_seed(seed) if seed != -1 else None
        output_images = pipe(
            prompt=prompt,
            image=image,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            negative_prompt=n_prompt,
            generator=generator
        ).images

        output_images[0].save("edited_canonical_image.png")
    
    edit_video_path = NaRCan_make_video(output_images[0], pth_path, frames_path)
    edit_image_path = [
        (image_path, "canonical image"), 
        ("control_map.png", "control map"),
        ("edited_canonical_image.png", "edited canonical image")
        ]

    # Here we return the first output image as the result
    return edit_video_path, edit_image_path


########
# demo #
########


intro = """
<div style="text-align:center">
<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
   NaRCan - <small>Natural Refined Canonical Image</small>
</h1>
<span>[<a target="_blank" href="https://koi953215.github.io/NaRCan_page/">Project page</a>], [<a target="_blank" href="https://huggingface.co/papers/2406.06523">Paper</a>]</span>
<div style="display:flex; justify-content: center;margin-top: 0.5em">Try selecting different control types (Canny or Lineart) in Advanced options!</div>
</div>
"""



with gr.Blocks(css="style.css") as demo:
    
    gr.HTML(intro)
    frames = gr.State()
    inverted_latents = gr.State()
    latents = gr.State()
    zs = gr.State()
    do_inversion = gr.State(value=True)

    with gr.Row():
        input_video = gr.Video(label="Input Video", interactive=False, elem_id="input_video", value='examples/bear.mp4', height=365, width=365)
        output_video = gr.Video(label="Edited Video", interactive=False, elem_id="output_video", height=365, width=365)
        # input_video.style(height=365, width=365)
        # output_video.style(height=365, width=365)


    with gr.Row():
            prompt = gr.Textbox(
                            label="Describe your edited video",
                            max_lines=1, 
                            value="bear, Van Gogh Style"
                            # placeholder="bear, Van Gogh Style"
                        )
    

    with gr.Row():
        canonical_result = gr.Gallery(label="Edited Canonical Image", columns=3)

               
    with gr.Row():
        run_button = gr.Button("Edit your video!", visible=True)

    max_images = 12
    default_num_images = 3
    with gr.Accordion('Advanced options', open=False):
        control_type = gr.Dropdown(
            ["Canny", "Lineart"], 
            label="Control Type", 
            info="Canny or Lineart",
            value="Lineart"
        )
        num_steps = gr.Slider(label='Steps',
                                minimum=1,
                                maximum=100,
                                value=20,
                                step=1)
        guidance_scale = gr.Slider(label='Guidance Scale',
                                    minimum=0.1,
                                    maximum=30.0,
                                    value=9.0,
                                    step=0.1)
        seed = gr.Slider(label='Seed',
                            minimum=-1,
                            maximum=2147483647,
                            step=1,
                            randomize=True)
        n_prompt = gr.Textbox(
            label='Negative Prompt',
            value=""
        )
                    
    input_video.change(
        fn = update_prompt,
        inputs = [input_video],
        outputs = [prompt],
        queue = False)
    
    run_button.click(fn = edit_with_pnp,
                     inputs = [input_video, 
                               prompt, 
                               num_steps, 
                               guidance_scale, 
                               seed, 
                               n_prompt,
                               control_type,
                               ],
                                 outputs = [output_video, canonical_result]
                                )

    gr.Examples(
        examples=get_example(),
        label='Examples',
        inputs=[input_video],
        outputs=[output_video],
        examples_per_page=8
    )

demo.queue()

demo.launch()