guangkaixu committed on
Commit
562fd4c
1 Parent(s): cdba047
README.md CHANGED
@@ -1,13 +1,24 @@
 ---
-title: GenPercept
+title: "GenPercept: Diffusion Models Trained with Large Data Are Transferable Visual Models"
 emoji: ⚡
 colorFrom: indigo
 colorTo: red
 sdk: gradio
 sdk_version: 4.25.0
 app_file: app.py
-pinned: false
-license: mit
+pinned: true
+models:
+- guangkaixu/GenPercept
+license: cc0-1.0
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+If you find this project useful, please cite our paper:
+
+```
+@article{xu2024diffusion,
+  title={Diffusion Models Trained with Large Data Are Transferable Visual Models},
+  author={Xu, Guangkai and Ge, Yongtao and Liu, Mingyu and Fan, Chengxiang and Xie, Kangyang and Zhao, Zhiyue and Chen, Hao and Shen, Chunhua},
+  journal={arXiv preprint arXiv:2403.06090},
+  year={2024}
+}
+```
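The `models:` field above points at the released checkpoint. For completeness, here is a small, hedged sketch of fetching it with `huggingface_hub`; the assumption that the repo ships the `unet/` and `vae/` subfolders and `empty_text_embed.npy` loaded by `app.py` below is ours, not something stated in the README:

```python
# Sketch only: download the checkpoint referenced in the Space metadata above.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="guangkaixu/GenPercept")  # model repo from `models:`
print("Checkpoint files downloaded to:", local_dir)
```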
app.py ADDED
@@ -0,0 +1,289 @@
+# Copyright 2024 Guangkai Xu, Zhejiang University. All rights reserved.
+#
+# Licensed under the CC0-1.0 license;
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://github.com/aim-uofa/GenPercept/blob/main/LICENSE
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# This code is based on the Marigold and diffusers codebases:
+# https://github.com/prs-eth/marigold
+# https://github.com/huggingface/diffusers
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find the bibtex at: https://github.com/aim-uofa/GenPercept#%EF%B8%8F-citation
+# More information about the method can be found at https://github.com/aim-uofa/GenPercept
+# --------------------------------------------------------------------------
+
+from __future__ import annotations
+
+import functools
+import os
+import tempfile
+import warnings
+
+import gradio as gr
+import numpy as np
+import spaces
+import torch
+from PIL import Image
+from diffusers import AutoencoderKL, UNet2DConditionModel
+from gradio_imageslider import ImageSlider
+
+from gradio_patches.examples import Examples
+from pipeline_genpercept import GenPerceptPipeline
+
+warnings.filterwarnings(
+    "ignore", message=".*LoginButton created outside of a Blocks context.*"
+)
+
+default_image_processing_res = 768
+default_image_reproducible = True
+
+
+def process_image_check(path_input):
+    if path_input is None:
+        raise gr.Error(
+            "Missing image in the first pane: upload a file or use one from the gallery below."
+        )
+
+
+def process_image(
+    pipe,
+    path_input,
+    processing_res=default_image_processing_res,
+):
+    name_base, name_ext = os.path.splitext(os.path.basename(path_input))
+    print(f"Processing image {name_base}{name_ext}")
+
+    path_output_dir = tempfile.mkdtemp()
+    path_out_fp32 = os.path.join(path_output_dir, f"{name_base}_depth_fp32.npy")
+    path_out_16bit = os.path.join(path_output_dir, f"{name_base}_depth_16bit.png")
+    path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.png")
+
+    input_image = Image.open(path_input)
+
+    pipe_out = pipe(
+        input_image,
+        processing_res=processing_res,
+        batch_size=1 if processing_res == 0 else 0,
+        show_progress_bar=False,
+    )
+
+    # GenPerceptOutput exposes `pred_np` / `pred_colored` (see pipeline_genpercept.py).
+    depth_pred = pipe_out.pred_np
+    depth_colored = pipe_out.pred_colored
+    depth_16bit = (depth_pred * 65535.0).astype(np.uint16)
+
+    np.save(path_out_fp32, depth_pred)
+    Image.fromarray(depth_16bit).save(path_out_16bit, mode="I;16")
+    depth_colored.save(path_out_vis)
+
+    return (
+        [path_out_16bit, path_out_vis],
+        [path_out_16bit, path_out_fp32, path_out_vis],
+    )
+
+
+def run_demo_server(pipe):
+    process_pipe_image = spaces.GPU(functools.partial(process_image, pipe))
+    # Video and BAS pipelines are not defined in this file; only the image tab below is wired up.
+    # process_pipe_video = spaces.GPU(functools.partial(process_video, pipe), duration=120)
+    # process_pipe_bas = spaces.GPU(functools.partial(process_bas, pipe))
+
+    gradio_theme = gr.themes.Default()
+
+    with gr.Blocks(
+        theme=gradio_theme,
+        title="GenPercept",
+        css="""
+            #download {
+                height: 118px;
+            }
+            .slider .inner {
+                width: 5px;
+                background: #FFF;
+            }
+            .viewport {
+                aspect-ratio: 4/3;
+            }
+            .tabs button.selected {
+                font-size: 20px !important;
+                color: crimson !important;
+            }
+            h1 {
+                text-align: center;
+                display: block;
+            }
+            h2 {
+                text-align: center;
+                display: block;
+            }
+            h3 {
+                text-align: center;
+                display: block;
+            }
+            .md_feedback li {
+                margin-bottom: 0px !important;
+            }
+        """,
+        head="""
+            <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
+            <script>
+                window.dataLayer = window.dataLayer || [];
+                function gtag() {dataLayer.push(arguments);}
+                gtag('js', new Date());
+                gtag('config', 'G-1FWSVCGZTG');
+            </script>
+        """,
+    ) as demo:
+        gr.Markdown(
+            """
+            # GenPercept: Diffusion Models Trained with Large Data Are Transferable Visual Models
+            <p align="center">
+                <a title="arXiv" href="https://arxiv.org/abs/2403.06090" target="_blank" rel="noopener noreferrer"
+                   style="display: inline-block;">
+                    <img src="https://www.obukhov.ai/img/badges/badge-pdf.svg">
+                </a>
+                <a title="Github" href="https://github.com/aim-uofa/GenPercept" target="_blank" rel="noopener noreferrer"
+                   style="display: inline-block;">
+                    <img src="https://img.shields.io/github/stars/aim-uofa/GenPercept?label=GitHub%20%E2%98%85&logo=github&color=C8C"
+                         alt="badge-github-stars">
+                </a>
+            </p>
+            <p align="justify">
+                GenPercept leverages the prior knowledge of stable diffusion models to estimate detailed visual perception results.
+                It achieves remarkable transferable performance on fundamental vision perception tasks using a moderate amount of target data
+                (even synthetic data only). Compared to previous methods, our inference process requires only one step and therefore runs faster.
+            </p>
+            """
+        )
+
+        with gr.Tabs(elem_classes=["tabs"]):
+            with gr.Tab("Depth Estimation"):
+                with gr.Row():
+                    with gr.Column():
+                        image_input = gr.Image(
+                            label="Input Image",
+                            type="filepath",
+                        )
+                        with gr.Row():
+                            image_submit_btn = gr.Button(
+                                value="Estimate Depth", variant="primary"
+                            )
+                            image_reset_btn = gr.Button(value="Reset")
+                        with gr.Accordion("Advanced options", open=False):
+                            image_processing_res = gr.Radio(
+                                [
+                                    ("Native", 0),
+                                    ("Recommended", 768),
+                                ],
+                                label="Processing resolution",
+                                value=default_image_processing_res,
+                            )
+                    with gr.Column():
+                        image_output_slider = ImageSlider(
+                            label="Predicted depth, gray / color (red = near, blue = far)",
+                            type="filepath",
+                            show_download_button=True,
+                            show_share_button=True,
+                            interactive=False,
+                            elem_classes="slider",
+                            position=0.25,
+                        )
+                        image_output_files = gr.Files(
+                            label="Depth outputs",
+                            elem_id="download",
+                            interactive=False,
+                        )
+
+                # Example galleries: anime_1..7, line_1..6, real_1..24.
+                filenames = []
+                filenames.extend(["anime_%d.jpg" % (i + 1) for i in range(7)])
+                filenames.extend(["line_%d.jpg" % (i + 1) for i in range(6)])
+                filenames.extend(["real_%d.jpg" % (i + 1) for i in range(24)])
+                Examples(
+                    fn=process_pipe_image,
+                    examples=[
+                        os.path.join("images", "depth", name) for name in filenames
+                    ],
+                    inputs=[image_input],
+                    outputs=[image_output_slider, image_output_files],
+                    cache_examples=True,
+                    directory_name="examples_image",
+                )
+
+        ### Image tab
+        image_submit_btn.click(
+            fn=process_image_check,
+            inputs=image_input,
+            outputs=None,
+            preprocess=False,
+            queue=False,
+        ).success(
+            fn=process_pipe_image,
+            inputs=[
+                image_input,
+                image_processing_res,
+            ],
+            outputs=[image_output_slider, image_output_files],
+            concurrency_limit=1,
+        )
+
+        image_reset_btn.click(
+            fn=lambda: (
+                None,
+                None,
+                None,
+                default_image_processing_res,
+            ),
+            inputs=[],
+            outputs=[
+                image_input,
+                image_output_slider,
+                image_output_files,
+                image_processing_res,
+            ],
+            queue=False,
+        )
+
+    ### Server launch
+    demo.queue(
+        api_open=False,
+    ).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+    )
+
+
+def main():
+    os.system("pip freeze")
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = torch.float32  # inference precision; the checkpoint is loaded in fp32
+
+    vae = AutoencoderKL.from_pretrained("./", subfolder="vae")
+    unet = UNet2DConditionModel.from_pretrained("./", subfolder="unet")
+    empty_text_embed = torch.from_numpy(np.load("./empty_text_embed.npy")).to(device, dtype)[None]  # [1, 77, 1024]
+
+    pipe = GenPerceptPipeline(
+        vae=vae,
+        unet=unet,
+        empty_text_embed=empty_text_embed,
+    )
+    try:
+        import xformers  # noqa: F401
+        pipe.enable_xformers_memory_efficient_attention()
+    except Exception:
+        pass  # run without xformers
+
+    pipe = pipe.to(device)
+    run_demo_server(pipe)
+
+
+if __name__ == "__main__":
+    main()
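A note on the files written by `process_image` above: the depth map is saved both as a float32 `.npy` array and as a 16-bit PNG quantized by a factor of 65535. A minimal sketch of recovering the float map from such a PNG (the file name is a placeholder, not an actual output path):

```python
# Invert the 16-bit quantization used in process_image: png = round(depth * 65535).
import numpy as np
from PIL import Image

depth_16bit = np.asarray(Image.open("example_depth_16bit.png"))  # placeholder path
depth = depth_16bit.astype(np.float32) / 65535.0                 # back to the [0, 1] range
```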
images/depth/.DS_Store ADDED
Binary file (6.15 kB)
 
images/depth/anime_1.jpg ADDED
images/depth/anime_2.jpg ADDED
images/depth/anime_3.jpg ADDED
images/depth/anime_4.jpg ADDED
images/depth/anime_5.jpg ADDED
images/depth/anime_6.jpg ADDED
images/depth/anime_7.jpg ADDED
images/depth/line_1.jpg ADDED
images/depth/line_2.jpg ADDED
images/depth/line_3.jpg ADDED
images/depth/line_4.jpg ADDED
images/depth/line_5.jpg ADDED
images/depth/line_6.jpg ADDED
images/depth/real_1.jpg ADDED
images/depth/real_10.jpg ADDED
images/depth/real_11.jpg ADDED
images/depth/real_12.jpg ADDED
images/depth/real_13.jpg ADDED
images/depth/real_14.jpg ADDED
images/depth/real_15.jpg ADDED
images/depth/real_16.jpg ADDED
images/depth/real_17.jpg ADDED
images/depth/real_18.jpg ADDED
images/depth/real_19.jpg ADDED
images/depth/real_2.jpg ADDED
images/depth/real_20.jpg ADDED
images/depth/real_21.jpg ADDED
images/depth/real_22.jpg ADDED
images/depth/real_23.jpg ADDED
images/depth/real_24.jpg ADDED
images/depth/real_3.jpg ADDED
images/depth/real_4.jpg ADDED
images/depth/real_5.jpg ADDED
images/depth/real_6.jpg ADDED
images/depth/real_7.jpg ADDED
images/depth/real_8.jpg ADDED
images/depth/real_9.jpg ADDED
pipeline_genpercept.py ADDED
@@ -0,0 +1,355 @@
+# --------------------------------------------------------
+# Diffusion Models Trained with Large Data Are Transferable Visual Models (https://arxiv.org/abs/2403.06090)
+# Github source: https://github.com/aim-uofa/GenPercept
+# Copyright (c) 2024 Zhejiang University
+# Licensed under The CC0 1.0 License [see LICENSE for details]
+# By Guangkai Xu
+# Based on the Marigold and diffusers codebases:
+# https://github.com/prs-eth/marigold
+# https://github.com/huggingface/diffusers
+# --------------------------------------------------------
+
+from dataclasses import dataclass
+from typing import Union
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+
+from tqdm.auto import tqdm
+from PIL import Image
+from torch.utils.data import DataLoader, TensorDataset
+
+from diffusers import (
+    DiffusionPipeline,
+    UNet2DConditionModel,
+    AutoencoderKL,
+)
+from diffusers.utils import BaseOutput
+
+from util.image_util import chw2hwc, colorize_depth_maps, resize_max_res, norm_to_rgb, resize_res
+from util.batchsize import find_batch_size
+
+
+@dataclass
+class GenPerceptOutput(BaseOutput):
+    """Output of `GenPerceptPipeline`: the raw prediction and its colorized visualization."""
+
+    pred_np: np.ndarray
+    pred_colored: Image.Image
+
+
+class GenPerceptPipeline(DiffusionPipeline):
+    """One-step perception pipeline built on a Stable Diffusion UNet and VAE."""
+
+    vae_scale_factor = 0.18215
+    task_infos = {
+        'depth': dict(task_channel_num=1, interpolate='bilinear'),
+        'seg': dict(task_channel_num=3, interpolate='nearest'),
+        'sr': dict(task_channel_num=3, interpolate='nearest'),
+        'normal': dict(task_channel_num=3, interpolate='bilinear'),
+    }
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        customized_head=None,
+        empty_text_embed=None,
+    ):
+        super().__init__()
+
+        self.empty_text_embed = empty_text_embed
+
+        # register
+        register_dict = dict(
+            unet=unet,
+            vae=vae,
+            customized_head=customized_head,
+        )
+        self.register_modules(**register_dict)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_image: Union[Image.Image, torch.Tensor],
+        mode: str = 'depth',
+        resize_hard: bool = False,
+        processing_res: int = 768,
+        match_input_res: bool = True,
+        batch_size: int = 0,
+        color_map: str = "Spectral",
+        show_progress_bar: bool = True,
+    ) -> GenPerceptOutput:
+        """
+        Function invoked when calling the pipeline.
+
+        Args:
+            input_image (Image):
+                Input RGB (or gray-scale) image.
+            mode (str, optional):
+                Perception task, one of 'depth', 'seg', 'sr', 'normal'.
+                Defaults to 'depth'.
+            processing_res (int, optional):
+                Maximum resolution of processing.
+                If set to 0: will not resize at all.
+                Defaults to 768.
+            match_input_res (bool, optional):
+                Resize the prediction to match the input resolution.
+                Only valid if `processing_res` > 0.
+                Defaults to True.
+            batch_size (int, optional):
+                Inference batch size.
+                If set to 0, the batch size is chosen automatically.
+                Defaults to 0.
+            show_progress_bar (bool, optional):
+                Display a progress bar over inference batches.
+                Defaults to True.
+            color_map (str, optional):
+                Colormap used to colorize the depth map.
+                Defaults to "Spectral".
+        Returns:
+            `GenPerceptOutput`
+        """
+
+        device = self.device
+
+        task_channel_num = self.task_infos[mode]['task_channel_num']
+
+        if not match_input_res:
+            assert (
+                processing_res is not None
+            ), "Value error: `match_input_res=False` requires a valid `processing_res`."
+        assert processing_res >= 0
+
+        # ----------------- Image Preprocess -----------------
+
+        if type(input_image) == torch.Tensor:  # [B, 3, H, W]
+            rgb_norm = input_image.to(device)
+            input_size = input_image.shape[2:]
+            bs_imgs = rgb_norm.shape[0]
+            assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
+            rgb_norm = rgb_norm.to(self.dtype)
+        else:
+            # if len(rgb_paths) > 0 and 'kitti' in rgb_paths[0]:
+            #     # kb crop
+            #     height = input_image.size[1]
+            #     width = input_image.size[0]
+            #     top_margin = int(height - 352)
+            #     left_margin = int((width - 1216) / 2)
+            #     input_image = input_image.crop((left_margin, top_margin, left_margin + 1216, top_margin + 352))
+
+            # TODO: check the kitti evaluation resolution here.
+            input_size = (input_image.size[1], input_image.size[0])
+            # Resize image
+            if processing_res > 0:
+                if resize_hard:
+                    input_image = resize_res(
+                        input_image, max_edge_resolution=processing_res
+                    )
+                else:
+                    input_image = resize_max_res(
+                        input_image, max_edge_resolution=processing_res
+                    )
+            input_image = input_image.convert("RGB")
+            image = np.asarray(input_image)
+
+            # Normalize rgb values
+            rgb = np.transpose(image, (2, 0, 1))  # [H, W, rgb] -> [rgb, H, W]
+            rgb_norm = rgb / 255.0 * 2.0 - 1.0
+            rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
+            rgb_norm = rgb_norm[None].to(device)
+            assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
+            bs_imgs = 1
+
+        # ----------------- Predicting depth -----------------
+
+        single_rgb_dataset = TensorDataset(rgb_norm)
+        if batch_size > 0:
+            _bs = batch_size
+        else:
+            _bs = find_batch_size(
+                ensemble_size=1,
+                input_res=max(rgb_norm.shape[1:]),
+                dtype=self.dtype,
+            )
+
+        single_rgb_loader = DataLoader(
+            single_rgb_dataset, batch_size=_bs, shuffle=False
+        )
+
+        # Predict depth maps (batched)
+        pred_list = []
+        if show_progress_bar:
+            iterable = tqdm(
+                single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
+            )
+        else:
+            iterable = single_rgb_loader
+
+        for batch in iterable:
+            (batched_img,) = batch
+            pred = self.single_infer(
+                rgb_in=batched_img,
+                mode=mode,
+            )
+            pred_list.append(pred.detach().clone())
+        preds = torch.concat(pred_list, axis=0).squeeze()
+        preds = preds.view(bs_imgs, task_channel_num, preds.shape[-2], preds.shape[-1])  # [bs_imgs, task_channel_num, H, W]
+
+        if match_input_res:
+            preds = F.interpolate(preds, input_size, mode=self.task_infos[mode]['interpolate'])
+
+        # ----------------- Post processing -----------------
+        if mode == 'depth':
+            if len(preds.shape) == 4:
+                preds = preds[:, 0]  # [bs_imgs, H, W]
+            # Scale prediction to [0, 1]
+            min_d = preds.view(bs_imgs, -1).min(dim=1)[0]
+            max_d = preds.view(bs_imgs, -1).max(dim=1)[0]
+            preds = (preds - min_d[:, None, None]) / (max_d[:, None, None] - min_d[:, None, None])
+            preds = preds.cpu().numpy().astype(np.float32)
+            # Colorize
+            pred_colored_img_list = []
+            for i in range(bs_imgs):
+                pred_colored_chw = colorize_depth_maps(
+                    preds[i], 0, 1, cmap=color_map
+                ).squeeze()  # [3, H, W], value in (0, 1)
+                pred_colored_chw = (pred_colored_chw * 255).astype(np.uint8)
+                pred_colored_hwc = chw2hwc(pred_colored_chw)
+                pred_colored_img = Image.fromarray(pred_colored_hwc)
+                pred_colored_img_list.append(pred_colored_img)
+
+            return GenPerceptOutput(
+                pred_np=np.squeeze(preds),
+                pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
+            )
+
+        elif mode == 'seg' or mode == 'sr':
+            if not self.customized_head:
+                # shift to [0, 1]
+                preds = (preds + 1.0) / 2.0
+                # shift to [0, 255]
+                preds = preds * 255
+                # Clip output range
+                preds = preds.clip(0, 255).cpu().numpy().astype(np.uint8)
+            else:
+                raise NotImplementedError
+
+            pred_colored_img_list = []
+            for i in range(preds.shape[0]):
+                pred_colored_hwc = chw2hwc(preds[i])
+                pred_colored_img = Image.fromarray(pred_colored_hwc)
+                pred_colored_img_list.append(pred_colored_img)
+
+            return GenPerceptOutput(
+                pred_np=np.squeeze(preds),
+                pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
+            )
+
+        elif mode == 'normal':
+            if not self.customized_head:
+                preds = preds.clip(-1, 1).cpu().numpy()  # [-1, 1]
+            else:
+                raise NotImplementedError
+
+            pred_colored_img_list = []
+            for i in range(preds.shape[0]):
+                pred_colored_chw = norm_to_rgb(preds[i])
+                pred_colored_hwc = chw2hwc(pred_colored_chw)
+                normal_colored_img_i = Image.fromarray(pred_colored_hwc)
+                pred_colored_img_list.append(normal_colored_img_i)
+
+            return GenPerceptOutput(
+                pred_np=np.squeeze(preds),
+                pred_colored=pred_colored_img_list[0] if len(pred_colored_img_list) == 1 else pred_colored_img_list,
+            )
+
+        else:
+            raise NotImplementedError
+
+    @torch.no_grad()
+    def single_infer(
+        self,
+        rgb_in: torch.Tensor,
+        mode: str = 'depth',
+    ) -> torch.Tensor:
+        """
+        Perform an individual prediction without ensembling.
+
+        Args:
+            rgb_in (torch.Tensor):
+                Input RGB image.
+            mode (str):
+                Perception task, one of 'depth', 'seg', 'sr', 'normal'.
+
+        Returns:
+            torch.Tensor: Predicted map.
+        """
+        device = rgb_in.device
+        bs_imgs = rgb_in.shape[0]
+        timesteps = torch.tensor([1]).long().repeat(bs_imgs).to(device)
+
+        # Encode image
+        rgb_latent = self.encode_rgb(rgb_in)
+
+        batch_embed = self.empty_text_embed
+        batch_embed = batch_embed.repeat((rgb_latent.shape[0], 1, 1)).to(device)  # [bs_imgs, 77, 1024]
+
+        # Forward!
+        if self.customized_head:
+            unet_features = self.unet(rgb_latent, timesteps, encoder_hidden_states=batch_embed, return_feature_only=True)[0][::-1]
+            pred = self.customized_head(unet_features)
+        else:
+            unet_output = self.unet(
+                rgb_latent, timesteps, encoder_hidden_states=batch_embed
+            )  # [bs_imgs, 4, h, w]
+            unet_pred = unet_output.sample
+            pred_latent = -unet_pred
+            pred_latent = pred_latent.to(device)
+            pred = self.decode_pred(pred_latent)
+            if mode == 'depth':
+                # mean of output channels
+                pred = pred.mean(dim=1, keepdim=True)
+            # clip prediction
+            pred = torch.clip(pred, -1.0, 1.0)
+        return pred
+
+    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
+        """
+        Encode an RGB image into a latent.
+
+        Args:
+            rgb_in (torch.Tensor):
+                Input RGB image to be encoded.
+
+        Returns:
+            torch.Tensor: Image latent.
+        """
+        try:
+            # encode
+            h_temp = self.vae.encoder(rgb_in)
+            moments = self.vae.quant_conv(h_temp)
+        except Exception:
+            # retry in full precision
+            h_temp = self.vae.encoder(rgb_in.float())
+            moments = self.vae.quant_conv(h_temp.float())
+
+        mean, logvar = torch.chunk(moments, 2, dim=1)
+        # scale latent
+        rgb_latent = mean * self.vae_scale_factor
+        return rgb_latent
+
+    def decode_pred(self, pred_latent: torch.Tensor) -> torch.Tensor:
+        """
+        Decode a prediction latent into the prediction label.
+
+        Args:
+            pred_latent (torch.Tensor):
+                Prediction latent to be decoded.
+
+        Returns:
+            torch.Tensor: Decoded prediction label.
+        """
+        # scale latent
+        pred_latent = pred_latent / self.vae_scale_factor
+        # decode
+        z = self.vae.post_quant_conv(pred_latent)
+        pred = self.vae.decoder(z)
+
+        return pred
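For reference, a minimal, hedged usage sketch of `GenPerceptPipeline` outside the Gradio demo, mirroring how `app.py` above constructs it. It assumes the same local checkpoint layout (`unet/` and `vae/` subfolders plus `empty_text_embed.npy` in the working directory) and uses one of the example images added in this commit:

```python
import numpy as np
import torch
from PIL import Image
from diffusers import AutoencoderKL, UNet2DConditionModel

from pipeline_genpercept import GenPerceptPipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the one-step UNet and VAE from the local checkpoint directory (assumed layout).
vae = AutoencoderKL.from_pretrained("./", subfolder="vae")
unet = UNet2DConditionModel.from_pretrained("./", subfolder="unet")
empty_text_embed = torch.from_numpy(np.load("./empty_text_embed.npy")).float()[None]  # [1, 77, 1024]

pipe = GenPerceptPipeline(unet=unet, vae=vae, empty_text_embed=empty_text_embed).to(device)

out = pipe(Image.open("images/depth/real_1.jpg"), mode="depth", processing_res=768)
out.pred_colored.save("real_1_depth_colored.png")  # colorized depth visualization
np.save("real_1_depth.npy", out.pred_np)           # raw depth in [0, 1]
```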