Spaces:

Koi953215
/

NaRCan_demo

Running on Zero

App Files Files Community

Koi953215 commited on Jun 19, 2024

Commit

f9cbc98

1 Parent(s): e1c55e6

init commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

NaRCan_model.py +221 -0
app.py +362 -0
canonical/bear.png +0 -0
canonical/boat.png +0 -0
canonical/cactus.png +0 -0
canonical/corgi.png +0 -0
canonical/gold-fish.png +0 -0
canonical/koolshooters.png +0 -0
canonical/overlook-the-ocean.png +0 -0
canonical/rotate.png +0 -0
canonical/shark-ocean.png +0 -0
canonical/surf.png +0 -0
canonical/woman-drink.png +0 -0
canonical/yacht.png +0 -0
examples/bear.mp4 +0 -0
examples/boat.mp4 +0 -0
examples/cactus.mp4 +0 -0
examples/corgi.mp4 +0 -0
examples/gold-fish.mp4 +0 -0
examples/koolshooters.mp4 +0 -0
examples/overlook-the-ocean.mp4 +0 -0
examples/rotate.mp4 +0 -0
examples/shark-ocean.mp4 +0 -0
examples/surf.mp4 +0 -0
examples/woman-drink.mp4 +0 -0
examples/yacht.mp4 +0 -0
examples_frames/bear/00040.jpg +0 -0
examples_frames/bear/00041.jpg +0 -0
examples_frames/bear/00042.jpg +0 -0
examples_frames/bear/00043.jpg +0 -0
examples_frames/bear/00044.jpg +0 -0
examples_frames/bear/00045.jpg +0 -0
examples_frames/bear/00046.jpg +0 -0
examples_frames/bear/00047.jpg +0 -0
examples_frames/bear/00048.jpg +0 -0
examples_frames/bear/00049.jpg +0 -0
examples_frames/bear/00050.jpg +0 -0
examples_frames/bear/00051.jpg +0 -0
examples_frames/bear/00052.jpg +0 -0
examples_frames/bear/00053.jpg +0 -0
examples_frames/bear/00054.jpg +0 -0
examples_frames/bear/00055.jpg +0 -0
examples_frames/bear/00056.jpg +0 -0
examples_frames/bear/00057.jpg +0 -0
examples_frames/bear/00058.jpg +0 -0
examples_frames/bear/00059.jpg +0 -0
examples_frames/bear/00060.jpg +0 -0
examples_frames/bear/00061.jpg +0 -0
examples_frames/bear/00062.jpg +0 -0
examples_frames/bear/00063.jpg +0 -0

NaRCan_model.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import torch
+from torch import nn
+import numpy as np
+import math
+# import tinycudann as tcnn
+class SineLayer(nn.Module):
+    # See paper sec. 3.2, final paragraph, and supplement Sec. 1.5 for discussion of omega_0.
+    # If is_first=True, omega_0 is a frequency factor which simply multiplies the activations before the
+    # nonlinearity. Different signals may require different omega_0 in the first layer - this is a
+    # hyperparameter.
+    # If is_first=False, then the weights will be divided by omega_0 so as to keep the magnitude of
+    # activations constant, but boost gradients to the weight matrix (see supplement Sec. 1.5)
+    def __init__(self, in_features, out_features, bias=True,
+                 is_first=False, omega_0=30):
+        super().__init__()
+        self.omega_0 = omega_0
+        self.is_first = is_first
+        self.in_features = in_features
+        self.linear = nn.Linear(in_features, out_features, bias=bias)
+        self.init_weights()
+    def init_weights(self):
+        with torch.no_grad():
+            if self.is_first:
+                self.linear.weight.uniform_(-1 / self.in_features,
+                                             1 / self.in_features)
+            else:
+                self.linear.weight.uniform_(-np.sqrt(6 / self.in_features) / self.omega_0,
+                                             np.sqrt(6 / self.in_features) / self.omega_0)
+    def forward(self, input):
+        return torch.sin(self.omega_0 * self.linear(input))
+    def forward_with_intermediate(self, input):
+        # For visualization of activation distributions
+        intermediate = self.omega_0 * self.linear(input)
+        return torch.sin(intermediate), intermediate
+class Siren(nn.Module):
+    def __init__(self, in_features, hidden_features, hidden_layers, out_features, outermost_linear=False,
+                 first_omega_0=30, hidden_omega_0=30.):
+        super().__init__()
+        self.net = []
+        self.net.append(SineLayer(in_features, hidden_features,
+                                  is_first=True, omega_0=first_omega_0))
+        for i in range(hidden_layers):
+            self.net.append(SineLayer(hidden_features, hidden_features,
+                                      is_first=False, omega_0=hidden_omega_0))
+        if outermost_linear:
+            final_linear = nn.Linear(hidden_features, out_features)
+            with torch.no_grad():
+                final_linear.weight.uniform_(-np.sqrt(6 / hidden_features) / hidden_omega_0,
+                                              np.sqrt(6 / hidden_features) / hidden_omega_0)
+            self.net.append(final_linear)
+        else:
+            self.net.append(SineLayer(hidden_features, out_features,
+                                      is_first=False, omega_0=hidden_omega_0))
+        self.net = nn.Sequential(*self.net)
+    def forward(self, coords):
+        output = self.net(coords)
+        return output
+class Homography(nn.Module):
+    def __init__(self, in_features=1, hidden_features=256, hidden_layers=1):
+        super().__init__()
+        out_features = 8
+        self.net = []
+        self.net.append(nn.Linear(in_features, hidden_features))
+        self.net.append(nn.ReLU(inplace=True))
+        for i in range(hidden_layers):
+            self.net.append(nn.Linear(hidden_features, hidden_features))
+            self.net.append(nn.ReLU(inplace=True))
+        self.net.append(nn.Linear(hidden_features, out_features))
+        self.net = nn.Sequential(*self.net)
+        self.init_weights()
+    def init_weights(self):
+        with torch.no_grad():
+            self.net[-1].bias.copy_(torch.Tensor([1., 0., 0., 0., 1., 0., 0., 0.]))
+    def forward(self, coords):
+        output = self.net(coords)
+        return output
+class Annealed(nn.Module):
+    def __init__(self, in_channels, annealed_step, annealed_begin_step=0, identity=True):
+        """
+        Defines a function that embeds x to (x, sin(2^k x), cos(2^k x), ...)
+        in_channels: number of input channels (3 for both xyz and direction)
+        """
+        super(Annealed, self).__init__()
+        self.N_freqs = 16
+        self.in_channels = in_channels
+        self.annealed = True
+        self.annealed_step = annealed_step
+        self.annealed_begin_step = annealed_begin_step
+        self.index = torch.linspace(0, self.N_freqs - 1, self.N_freqs)
+        self.identity = identity
+        self.index_2 = self.index.view(-1, 1).repeat(1, 2).view(-1)
+    def forward(self, x_embed, step):
+        """
+        Embeds x to (x, sin(2^k x), cos(2^k x), ...)
+        Different from the paper, "x" is also in the output
+        See https://github.com/bmild/nerf/issues/12
+        Inputs:
+            x: (B, self.in_channels)
+        Outputs:
+            out: (B, self.out_channels)
+        """
+        use_PE = False
+        if self.annealed_begin_step == 0:
+            # calculate the w for each freq bands
+            alpha = self.N_freqs * step / float(self.annealed_step)
+        else:
+            if step <= self.annealed_begin_step:
+                alpha = 0
+            else:
+                alpha = (self.N_freqs) * (step - self.annealed_begin_step) / float(
+                    self.annealed_step)
+        w = (1 - torch.cos(math.pi * torch.clamp(alpha * torch.ones_like(self.index_2) - self.index_2, 0, 1))) / 2
+        if use_PE:
+            w[16:] = w[:16]
+        out = x_embed * w.to(x_embed.device)
+        return out
+class BARF_PE(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.encoder = tcnn.Encoding(n_input_dims=2,
+                                     encoding_config=config["positional encoding"])
+        self.decoder = tcnn.Network(n_input_dims=self.encoder.n_output_dims +
+                                    2,
+                                    n_output_dims=3,
+                                    network_config=config["BARF network"])
+    def forward(self, x, step=0, aneal_func=None):
+        input = x
+        input = self.encoder(input)
+        if aneal_func is not None:
+            input = torch.cat([x, aneal_func(input,step)], dim=-1)
+        else:
+            input = torch.cat([x, input], dim=-1)
+        weight = torch.ones(input.shape[-1], device=input.device).cuda()
+        x = self.decoder(weight * input)
+        return x
+class Deform_Hash3d(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.encoder = tcnn.Encoding(n_input_dims=3,
+                                     encoding_config=config["encoding_deform3d"])
+        self.decoder = nn.Sequential(nn.Linear(self.encoder.n_output_dims + 3, 256),
+                                     nn.ReLU(),
+                                     nn.Linear(256, 256),
+                                     nn.ReLU(),
+                                     nn.Linear(256, 256),
+                                     nn.ReLU(),
+                                     nn.Linear(256, 256),
+                                     nn.ReLU(),
+                                     nn.Linear(256, 256),
+                                     nn.ReLU(),
+                                     nn.Linear(256, 256),
+                                     nn.ReLU(),
+                                     nn.Linear(256, 2)
+                                     )
+    def forward(self, x, step=0, aneal_func=None):
+        input = x
+        input = self.encoder(input)
+        if aneal_func is not None:
+            input = torch.cat([x, aneal_func(input,step)], dim=-1)
+        else:
+            input = torch.cat([x, input], dim=-1)
+        weight = torch.ones(input.shape[-1], device=input.device).cuda()
+        x = self.decoder(weight * input) / 5
+        return x
+class Deform_Hash3d_Warp(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.Deform_Hash3d = Deform_Hash3d(config)
+    def forward(self, xyt_norm, step=0,aneal_func=None):
+        x = self.Deform_Hash3d(xyt_norm,step=step, aneal_func=aneal_func)
+        return x

app.py ADDED Viewed

	@@ -0,0 +1,362 @@

+import gradio as gr
+import numpy as np
+import torch
+import cv2
+import os
+import imageio
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from controlnet_aux import LineartDetector
+from functools import partial
+from PIL import Image
+from torch.utils.data import DataLoader, Dataset
+from torchvision.transforms import Compose, ToTensor, Normalize, Resize
+from NaRCan_model import Homography, Siren
+from util import get_mgrid, apply_homography, jacobian, VideoFitting, TestVideoFitting
+def get_example():
+    case = [
+        [
+            'examples/bear.mp4',
+        ],
+        [
+            'examples/boat.mp4',
+        ],
+        [
+            'examples/woman-drink.mp4',
+        ],
+        [
+            'examples/corgi.mp4',
+        ],
+        [
+            'examples/yacht.mp4',
+        ],
+        [
+            'examples/koolshooters.mp4',
+        ],
+        [
+            'examples/overlook-the-ocean.mp4',
+        ],
+        [
+            'examples/rotate.mp4',
+        ],
+        [
+            'examples/shark-ocean.mp4',
+        ],
+        [
+            'examples/surf.mp4',
+        ],
+        [
+            'examples/cactus.mp4',
+        ],
+        [
+            'examples/gold-fish.mp4',
+        ]
+    ]
+    return case
+def set_default_prompt(video_name):
+    video_to_prompt = {
+        'bear.mp4': 'bear, Van Gogh Style',
+        'boat.mp4': 'a burning boat sails on lava',
+        'cactus.mp4': 'cactus, made of paper',
+        'corgi.mp4': 'a hellhound',
+        'gold-fish.mp4': 'Goldfish in the Milky Way',
+        'koolshooters.mp4': 'Avatar',
+        'overlook-the-ocean.mp4': 'ocean, pixel style',
+        'rotate.mp4': 'turbine engine',
+        'shark-ocean.mp4': 'A sleek shark, cartoon style',
+        'surf.mp4': 'Sailing, The background is a large white cloud, sketch style',
+        'woman-drink.mp4': 'a drinking zombie',
+        'yacht.mp4': 'yacht, cyberpunk style',
+    }
+    return video_to_prompt.get(video_name, '')
+def update_prompt(input_video):
+    video_name = input_video.split('/')[-1]
+    return set_default_prompt(video_name)
+# Map videos to corresponding images
+video_to_image = {
+    'bear.mp4': ['canonical/bear.png', 'pth_file/bear', 'examples_frames/bear'],
+    'boat.mp4': ['canonical/boat.png', 'pth_file/boat', 'examples_frames/boat'],
+    'cactus.mp4': ['canonical/cactus.png', 'pth_file/cactus', 'examples_frames/cactus'],
+    'corgi.mp4': ['canonical/corgi.png', 'pth_file/corgi', 'examples_frames/corgi'],
+    'gold-fish.mp4': ['canonical/gold-fish.png', 'pth_file/gold-fish', 'examples_frames/gold-fish'],
+    'koolshooters.mp4': ['canonical/koolshooters.png', 'pth_file/koolshooters', 'examples_frames/koolshooters'],
+    'overlook-the-ocean.mp4': ['canonical/overlook-the-ocean.png', 'pth_file/overlook-the-ocean', 'examples_frames/overlook-the-ocean'],
+    'rotate.mp4': ['canonical/rotate.png', 'pth_file/rotate', 'examples_frames/rotate'],
+    'shark-ocean.mp4': ['canonical/shark-ocean.png', 'pth_file/shark-ocean', 'examples_frames/shark-ocean'],
+    'surf.mp4': ['canonical/surf.png', 'pth_file/surf', 'examples_frames/surf'],
+    'woman-drink.mp4': ['canonical/woman-drink.png', 'pth_file/woman-drink', 'examples_frames/woman-drink'],
+    'yacht.mp4': ['canonical/yacht.png', 'pth_file/yacht', 'examples_frames/yacht'],
+}
+def images_to_video(image_list, output_path, fps=10):
+    # Convert PIL Images to numpy arrays
+    frames = [np.array(img).astype(np.uint8) for img in image_list]
+    frames = frames[:20]
+    # Create video writer
+    writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
+    for frame in frames:
+        writer.append_data(frame)
+    writer.close()
+def NaRCan_make_video(edit_canonical, pth_path, frames_path):
+    # load NaRCan model
+    checkpoint_g_old = torch.load(os.path.join(pth_path, "homography_g.pth"))
+    checkpoint_g = torch.load(os.path.join(pth_path, "mlp_g.pth"))
+    g_old = Homography(hidden_features=256, hidden_layers=2).cuda()
+    g = Siren(in_features=3, out_features=2, hidden_features=256,
+              hidden_layers=5, outermost_linear=True).cuda()
+    g_old.load_state_dict(checkpoint_g_old)
+    g.load_state_dict(checkpoint_g)
+    g_old.eval()
+    g.eval()
+    transform = Compose([
+        Resize(512),
+        ToTensor(),
+        Normalize(torch.Tensor([0.5, 0.5, 0.5]), torch.Tensor([0.5, 0.5, 0.5]))
+    ])
+    v = TestVideoFitting(frames_path, transform)
+    videoloader = DataLoader(v, batch_size=1, pin_memory=True, num_workers=0)
+    model_input, ground_truth = next(iter(videoloader))
+    model_input, ground_truth = model_input[0].cuda(), ground_truth[0].cuda()
+    myoutput = None
+    data_len = len(os.listdir(frames_path))
+    with torch.no_grad():
+        batch_size = (v.H * v.W)
+        for step in range(data_len):
+            start = (step * batch_size) % len(model_input)
+            end = min(start + batch_size, len(model_input))
+            # get the deformation
+            xy, t = model_input[start:end, :-1], model_input[start:end, [-1]]
+            xyt = model_input[start:end]
+            h_old = apply_homography(xy, g_old(t))
+            h = g(xyt)
+            xy_ = h_old + h
+            # use canonical to reconstruct
+            w, h = v.W, v.H
+            canonical_img = np.array(edit_canonical.convert('RGB'))
+            canonical_img = torch.from_numpy(canonical_img).float().cuda()
+            h_c, w_c = canonical_img.shape[:2]
+            grid_new = xy_.clone()
+            grid_new[..., 1] = xy_[..., 0] / 1.5
+            grid_new[..., 0] = xy_[..., 1] / 2.0
+            if len(canonical_img.shape) == 3:
+                canonical_img = canonical_img.unsqueeze(0)
+            results = torch.nn.functional.grid_sample(
+                canonical_img.permute(0, 3, 1, 2),
+                grid_new.unsqueeze(1).unsqueeze(0),
+                mode='bilinear',
+                padding_mode='border')
+            o = results.squeeze().permute(1,0)
+            if step == 0:
+                myoutput = o
+            else:
+                myoutput = torch.cat([myoutput, o])
+    myoutput = myoutput.reshape(512, 512, data_len, 3).permute(2, 0, 1, 3).clone().detach().cpu().numpy().astype(np.float32)
+    # myoutput = np.clip(myoutput, -1, 1) * 0.5 + 0.5
+    for i in range(len(myoutput)):
+        myoutput[i] = Image.fromarray(np.uint8(myoutput[i])).resize((512, 512)) #854, 480
+    edit_video_path = f'NaRCan_fps_10.mp4'
+    images_to_video(myoutput, edit_video_path)
+    return edit_video_path
+def edit_with_pnp(input_video, prompt, num_steps, guidance_scale, seed, n_prompt, control_type="Lineart"):
+    video_name = input_video.split('/')[-1]
+    if video_name in video_to_image:
+        image_path = video_to_image[video_name][0]
+        pth_path = video_to_image[video_name][1]
+        frames_path = video_to_image[video_name][2]
+    else:
+        return None
+    if control_type == "Lineart":
+        # Load the control net model for lineart
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_lineart", torch_dtype=torch.float16)
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.to("cuda")
+        # lineart
+        processor = LineartDetector.from_pretrained("lllyasviel/Annotators")
+        processor_partial = partial(processor, coarse=False)
+        size_ = 768
+        canonical_image = Image.open(image_path)
+        ori_size = canonical_image.size
+        image = processor_partial(canonical_image.resize((size_, size_)), detect_resolution=size_, image_resolution=size_)
+        image = image.resize(ori_size, resample=Image.BILINEAR)
+        generator = torch.manual_seed(seed) if seed != -1 else None
+        output_images = pipe(
+            prompt=prompt,
+            image=image,
+            num_inference_steps=num_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=n_prompt,
+            generator=generator
+        ).images
+        # output_images[0] = output_images[0].resize(ori_size, resample=Image.BILINEAR)
+    else:
+        # Load the control net model for canny
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16)
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+        )
+        pipe.to("cuda")
+        # canny
+        canonical_image = cv2.imread(image_path)
+        canonical_image = cv2.cvtColor(canonical_image, cv2.COLOR_BGR2RGB)
+        image = cv2.cvtColor(canonical_image, cv2.COLOR_RGB2GRAY)
+        image = cv2.Canny(image, 100, 200)
+        image = image[:, :, None]
+        image = np.concatenate([image, image, image], axis=2)
+        image = Image.fromarray(image)
+        generator = torch.manual_seed(seed) if seed != -1 else None
+        output_images = pipe(
+            prompt=prompt,
+            image=image,
+            num_inference_steps=num_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=n_prompt,
+            generator=generator
+        ).images
+    edit_video_path = NaRCan_make_video(output_images[0], pth_path, frames_path)
+    # Here we return the first output image as the result
+    return edit_video_path
+########
+# demo #
+########
+intro = """
+<div style="text-align:center">
+<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
+   NaRCan - <small>Natural Refined Canonical Image</small>
+</h1>
+<span>[<a target="_blank" href="https://koi953215.github.io/NaRCan_page/">Project page</a>], [<a target="_blank" href="https://huggingface.co/papers/2406.06523">Paper</a>]</span>
+<div style="display:flex; justify-content: center;margin-top: 0.5em">Each edit takes ~10 sec </div>
+</div>
+"""
+with gr.Blocks(css="style.css") as demo:
+    gr.HTML(intro)
+    frames = gr.State()
+    inverted_latents = gr.State()
+    latents = gr.State()
+    zs = gr.State()
+    do_inversion = gr.State(value=True)
+    with gr.Row():
+        input_video = gr.Video(label="Input Video", interactive=False, elem_id="input_video", value='examples/bear.mp4')
+        output_video = gr.Video(label="Edited Video", interactive=False, elem_id="output_video")
+        input_video.style(height=365, width=365)
+        output_video.style(height=365, width=365)
+    with gr.Row():
+            prompt = gr.Textbox(
+                            label="Describe your edited video",
+                            max_lines=1,
+                            value="bear, Van Gogh Style"
+                            # placeholder="bear, Van Gogh Style"
+                        )
+    with gr.Row():
+        run_button = gr.Button("Edit your video!", visible=True)
+    max_images = 12
+    default_num_images = 3
+    with gr.Accordion('Advanced options', open=False):
+        control_type = gr.Dropdown(
+            ["Canny", "Lineart"],
+            label="Control Type",
+            info="Canny or Lineart",
+            value="Lineart"
+        )
+        num_steps = gr.Slider(label='Steps',
+                                minimum=1,
+                                maximum=100,
+                                value=20,
+                                step=1)
+        guidance_scale = gr.Slider(label='Guidance Scale',
+                                    minimum=0.1,
+                                    maximum=30.0,
+                                    value=9.0,
+                                    step=0.1)
+        seed = gr.Slider(label='Seed',
+                            minimum=-1,
+                            maximum=2147483647,
+                            step=1,
+                            randomize=True)
+        n_prompt = gr.Textbox(
+            label='Negative Prompt',
+            value=""
+        )
+    input_video.change(
+        fn = update_prompt,
+        inputs = [input_video],
+        outputs = [prompt],
+        queue = False)
+    run_button.click(fn = edit_with_pnp,
+                     inputs = [input_video,
+                               prompt,
+                               num_steps,
+                               guidance_scale,
+                               seed,
+                               n_prompt,
+                               control_type,
+                               ],
+                                 outputs = [output_video]
+                                )
+    gr.Examples(
+        examples=get_example(),
+        label='Examples',
+        inputs=[input_video],
+        outputs=[output_video],
+        examples_per_page=8
+    )
+demo.queue()
+demo.launch(share=True)