Spaces:

Stable-X
/

StableDelight

Running on Zero

App Files Files Community

Stable-X committed on Jun 15, 2024

Commit

508279d

1 Parent(s): c0e046b

Update code

Browse files

Files changed (8) hide show

README.md +11 -9
app.py +413 -0
requirements.txt +132 -0
requirements_min.txt +17 -0
stablediffuse/__init__.py +0 -0
stablediffuse/__pycache__/__init__.cpython-39.pyc +0 -0
stablediffuse/__pycache__/pipeline_yoso_diffuse.cpython-39.pyc +0 -0
stablediffuse/pipeline_yoso_diffuse.py +724 -0

README.md CHANGED Viewed

@@ -1,13 +1,15 @@
 ---
-title: StableDiffuse
-emoji: 👁
-colorFrom: red
-colorTo: yellow
 sdk: gradio
-sdk_version: 4.36.1
 app_file: app.py
-pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: StableDiffuse: Removing Reflections from Textured Surfaces in a Single Image
+emoji: 🏵️
+colorFrom: blue
+colorTo: red
 sdk: gradio
+sdk_version: 4.32.2
 app_file: app.py
+pinned: true
+license: cc-by-sa-4.0
+models:
+- Stable-X/yoso-diffuse-v0-2
+hf_oauth: true
+hf_oauth_expiration_minutes: 43200
 ---

app.py ADDED Viewed

	@@ -0,0 +1,413 @@

+# Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+from __future__ import annotations
+import functools
+import os
+import tempfile
+import diffusers
+import gradio as gr
+import imageio as imageio
+import numpy as np
+import spaces
+import torch as torch
+torch.backends.cuda.matmul.allow_tf32 = True
+from PIL import Image
+from gradio_imageslider import ImageSlider
+from tqdm import tqdm
+from pathlib import Path
+import gradio
+from gradio.utils import get_cache_folder
+from stablediffuse.pipeline_yoso_diffuse import YOSODiffusePipeline
+class Examples(gradio.helpers.Examples):
+    def __init__(self, *args, directory_name=None, **kwargs):
+        super().__init__(*args, **kwargs, _initiated_directly=False)
+        if directory_name is not None:
+            self.cached_folder = get_cache_folder() / directory_name
+            self.cached_file = Path(self.cached_folder) / "log.csv"
+        self.create()
+# Demo-wide defaults.
+# NOTE(review): `default_seed`, `default_batch_size`, `default_video_num_inference_steps`
+# and `default_video_processing_resolution` are not referenced by the code visible in
+# this file - confirm whether they are still needed.
+default_seed = 2024
+default_batch_size = 1
+# Target length (in pixels) of the shorter image side used by `process_image`.
+default_image_processing_resolution = 768
+default_video_num_inference_steps = 10
+default_video_processing_resolution = 768
+# Default cap on the number of frames `process_video` writes to the output clip.
+default_video_out_max_frames = 60
+def process_image_check(path_input):
+    if path_input is None:
+        raise gr.Error(
+            "Missing image in the first pane: upload a file or use one from the gallery below."
+        )
+def resize_image(input_image, resolution):
+    # Ensure input_image is a PIL Image object
+    if not isinstance(input_image, Image.Image):
+        raise ValueError("input_image should be a PIL Image object")
+    # Convert image to numpy array
+    input_image_np = np.asarray(input_image)
+    # Get image dimensions
+    H, W, C = input_image_np.shape
+    H = float(H)
+    W = float(W)
+    # Calculate the scaling factor
+    k = float(resolution) / min(H, W)
+    # Determine new dimensions
+    H *= k
+    W *= k
+    H = int(np.round(H / 64.0)) * 64
+    W = int(np.round(W / 64.0)) * 64
+    # Resize the image using PIL's resize method
+    img = input_image.resize((W, H), Image.Resampling.LANCZOS)
+    return img
+def process_image(
+    pipe,
+    path_input,
+):
+    name_base, name_ext = os.path.splitext(os.path.basename(path_input))
+    print(f"Processing image {name_base}{name_ext}")
+    path_output_dir = tempfile.mkdtemp()
+    path_out_png = os.path.join(path_output_dir, f"{name_base}_diffuse.png")
+    input_image = Image.open(path_input)
+    input_image = resize_image(input_image, default_image_processing_resolution)
+    pipe_out = pipe(
+        input_image,
+        match_input_resolution=False,
+        processing_resolution=max(input_image.size)
+    )
+    processed_frame = (pipe_out.prediction.clip(-1, 1) + 1) / 2
+    processed_frame = (processed_frame[0] * 255).astype(np.uint8)
+    processed_frame = Image.fromarray(processed_frame)
+    processed_frame.save(path_out_png)
+    yield [input_image, path_out_png]
+def center_crop(img):
+    # Open the image file
+    img_width, img_height = img.size
+    crop_width =min(img_width, img_height)
+    # Calculate the cropping box
+    left = (img_width - crop_width) / 2
+    top = (img_height - crop_width) / 2
+    right = (img_width + crop_width) / 2
+    bottom = (img_height + crop_width) / 2
+    # Crop the image
+    img_cropped = img.crop((left, top, right, bottom))
+    return img_cropped
+def process_video(
+    pipe,
+    path_input,
+    out_max_frames=default_video_out_max_frames,
+    target_fps=10,
+    progress=gr.Progress(),
+):
+    if path_input is None:
+        raise gr.Error(
+            "Missing video in the first pane: upload a file or use one from the gallery below."
+        )
+    name_base, name_ext = os.path.splitext(os.path.basename(path_input))
+    print(f"Processing video {name_base}{name_ext}")
+    path_output_dir = tempfile.mkdtemp()
+    path_out_vis = os.path.join(path_output_dir, f"{name_base}_diffuse_colored.mp4")
+    init_latents = None
+    reader, writer = None, None
+    try:
+        reader = imageio.get_reader(path_input)
+        meta_data = reader.get_meta_data()
+        fps = meta_data["fps"]
+        size = meta_data["size"]
+        duration_sec = meta_data["duration"]
+        writer = imageio.get_writer(path_out_vis, fps=target_fps)
+        out_frame_id = 0
+        pbar = tqdm(desc="Processing Video", total=duration_sec)
+        for frame_id, frame in enumerate(reader):
+            if frame_id % (fps // target_fps) != 0:
+                continue
+            else:
+                out_frame_id += 1
+                pbar.update(1)
+            if out_frame_id > out_max_frames:
+                break
+            frame_pil = Image.fromarray(frame)
+            # frame_pil = center_crop(frame_pil)
+            pipe_out = pipe(
+                frame_pil,
+                match_input_resolution=False,
+                latents=init_latents
+            )
+            if init_latents is None:
+                init_latents = pipe_out.gaus_noise
+            processed_frame = (pipe_out.prediction.clip(-1, 1) + 1) / 2
+            processed_frame = processed_frame[0]
+            _processed_frame = imageio.core.util.Array(processed_frame)
+            writer.append_data(_processed_frame)
+            yield (
+                [frame_pil, processed_frame],
+                None,
+            )
+    finally:
+        if writer is not None:
+            writer.close()
+        if reader is not None:
+            reader.close()
+    yield (
+        [frame_pil, processed_frame],
+        [path_out_vis,]
+    )
+def run_demo_server(pipe):
+    process_pipe_image = spaces.GPU(functools.partial(process_image, pipe))
+    process_pipe_video = spaces.GPU(
+        functools.partial(process_video, pipe), duration=120
+    )
+    gradio_theme = gr.themes.Default()
+    with gr.Blocks(
+        theme=gradio_theme,
+        title="Stable Diffuse Estimation",
+        css="""
+            #download {
+                height: 118px;
+            }
+            .slider .inner {
+                width: 5px;
+                background: #FFF;
+            }
+            .viewport {
+                aspect-ratio: 4/3;
+            }
+            .tabs button.selected {
+                font-size: 20px !important;
+                color: crimson !important;
+            }
+            h1 {
+                text-align: center;
+                display: block;
+            }
+            h2 {
+                text-align: center;
+                display: block;
+            }
+            h3 {
+                text-align: center;
+                display: block;
+            }
+            .md_feedback li {
+                margin-bottom: 0px !important;
+            }
+        """,
+        head="""
+            <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
+            <script>
+                window.dataLayer = window.dataLayer || [];
+                function gtag() {dataLayer.push(arguments);}
+                gtag('js', new Date());
+                gtag('config', 'G-1FWSVCGZTG');
+            </script>
+        """,
+    ) as demo:
+        gr.Markdown(
+            """
+            # StableDiffuse: Removing Reflections from Textured Surfaces in a Single Image
+            <p align="center">
+        """
+        )
+        with gr.Tabs(elem_classes=["tabs"]):
+            with gr.Tab("Image"):
+                with gr.Row():
+                    with gr.Column():
+                        image_input = gr.Image(
+                            label="Input Image",
+                            type="filepath",
+                        )
+                        with gr.Row():
+                            image_submit_btn = gr.Button(
+                                value="Compute Diffuse", variant="primary"
+                            )
+                            image_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        image_output_slider = ImageSlider(
+                            label="Diffuse outputs",
+                            type="filepath",
+                            show_download_button=True,
+                            show_share_button=True,
+                            interactive=False,
+                            elem_classes="slider",
+                            position=0.25,
+                        )
+                Examples(
+                    fn=process_pipe_image,
+                    examples=sorted([
+                        os.path.join("files", "image", name)
+                        for name in os.listdir(os.path.join("files", "image"))
+                    ]),
+                    inputs=[image_input],
+                    outputs=[image_output_slider],
+                    cache_examples=False,
+                    directory_name="examples_image",
+                )
+            with gr.Tab("Video"):
+                with gr.Row():
+                    with gr.Column():
+                        video_input = gr.Video(
+                            label="Input Video",
+                            sources=["upload", "webcam"],
+                        )
+                        with gr.Row():
+                            video_submit_btn = gr.Button(
+                                value="Compute Diffuse", variant="primary"
+                            )
+                            video_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        processed_frames = ImageSlider(
+                            label="Realtime Visualization",
+                            type="filepath",
+                            show_download_button=True,
+                            show_share_button=True,
+                            interactive=False,
+                            elem_classes="slider",
+                            position=0.25,
+                        )
+                        video_output_files = gr.Files(
+                            label="Diffuse outputs",
+                            elem_id="download",
+                            interactive=False,
+                        )
+                Examples(
+                    fn=process_pipe_video,
+                    examples=sorted([
+                        os.path.join("files", "video", name)
+                        for name in os.listdir(os.path.join("files", "video"))
+                    ]),
+                    inputs=[video_input],
+                    outputs=[processed_frames, video_output_files],
+                    directory_name="examples_video",
+                    cache_examples=False,
+                )
+        ### Image tab
+        image_submit_btn.click(
+            fn=process_image_check,
+            inputs=image_input,
+            outputs=None,
+            preprocess=False,
+            queue=False,
+        ).success(
+            fn=process_pipe_image,
+            inputs=[
+                image_input,
+            ],
+            outputs=[image_output_slider],
+            concurrency_limit=1,
+        )
+        image_reset_btn.click(
+            fn=lambda: (
+                None,
+                None,
+                None,
+            ),
+            inputs=[],
+            outputs=[
+                image_input,
+                image_output_slider,
+            ],
+            queue=False,
+        )
+        ### Video tab
+        video_submit_btn.click(
+            fn=process_pipe_video,
+            inputs=[video_input],
+            outputs=[processed_frames, video_output_files],
+            concurrency_limit=1,
+        )
+        video_reset_btn.click(
+            fn=lambda: (None, None, None),
+            inputs=[],
+            outputs=[video_input, processed_frames, video_output_files],
+            concurrency_limit=1,
+        )
+        ### Server launch
+        demo.queue(
+            api_open=False,
+        ).launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+        )
+def main():
+    os.system("pip freeze")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    pipe = YOSODiffusePipeline.from_pretrained(
+        'weights/yoso-diffuse-v0-2', trust_remote_code=True, variant="fp16",
+        torch_dtype=torch.float16, t_start=0).to(device)
+    try:
+        import xformers
+        pipe.enable_xformers_memory_efficient_attention()
+    except:
+        pass  # run without xformers
+    run_demo_server(pipe)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,132 @@

+accelerate==0.30.1
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+async-timeout==4.0.3
+attrs==23.2.0
+Authlib==1.3.0
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.0.4
+contourpy==1.2.1
+cryptography==42.0.7
+cycler==0.12.1
+dataclasses-json==0.6.6
+datasets==2.19.1
+Deprecated==1.2.14
+diffusers==0.28.0
+dill==0.3.8
+dnspython==2.6.1
+email_validator==2.1.1
+exceptiongroup==1.2.1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.14.0
+fonttools==4.53.0
+frozenlist==1.4.1
+fsspec==2024.3.1
+gradio==4.32.2
+gradio_client==0.17.0
+gradio_imageslider==0.0.20
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.0
+idna==3.7
+imageio==2.34.1
+imageio-ffmpeg==0.5.0
+importlib_metadata==7.1.0
+importlib_resources==6.4.0
+itsdangerous==2.2.0
+Jinja2==3.1.4
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.2
+matplotlib==3.8.2
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+networkx==3.3
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.19.3
+nvidia-nvjitlink-cu12==12.5.40
+nvidia-nvtx-cu12==12.1.105
+orjson==3.10.3
+packaging==24.0
+pandas==2.2.2
+pillow==10.3.0
+protobuf==3.20.3
+psutil==5.9.8
+pyarrow==16.0.0
+pyarrow-hotfix==0.6
+pycparser==2.22
+pydantic==2.7.2
+pydantic_core==2.18.3
+pydub==0.25.1
+pygltflib==1.16.1
+Pygments==2.18.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.1
+regex==2024.5.15
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.4.7
+safetensors==0.4.3
+scipy==1.11.4
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+spaces==0.28.3
+starlette==0.37.2
+sympy==1.12.1
+tokenizers==0.15.2
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.0
+tqdm==4.66.4
+transformers==4.36.1
+trimesh==4.0.5
+triton==2.2.0
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.1
+uvicorn==0.30.0
+uvloop==0.19.0
+watchfiles==0.22.0
+websockets==11.0.3
+wrapt==1.16.0
+xformers==0.0.24
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.19.1
+einops==0.7.0

requirements_min.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+gradio>=4.32.1
+gradio-imageslider>=0.0.20
+pygltflib==1.16.1
+trimesh==4.0.5
+imageio
+imageio-ffmpeg
+Pillow
+einops==0.7.0
+spaces
+accelerate
+diffusers>=0.28.0
+matplotlib==3.8.2
+scipy==1.11.4
+torch==2.0.1
+transformers==4.36.1
+xformers==0.0.21

stablediffuse/__init__.py ADDED Viewed

File without changes

stablediffuse/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (162 Bytes). View file

stablediffuse/__pycache__/pipeline_yoso_diffuse.cpython-39.pyc ADDED Viewed

Binary file (24.3 kB). View file

stablediffuse/pipeline_yoso_diffuse.py ADDED Viewed

	@@ -0,0 +1,724 @@

+# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# More information and citation instructions are available on the Marigold project website:
+# https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
+import torch
+from PIL import Image
+from tqdm.auto import tqdm
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+	ControlNetModel,
+)
+from diffusers.schedulers import (
+	DDIMScheduler
+)
+from diffusers.utils import (
+    BaseOutput,
+    logging,
+    replace_example_docstring,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.controlnet import StableDiffusionControlNetPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.marigold.marigold_image_processing import MarigoldImageProcessor
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+import pdb
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Usage example passed to `replace_example_docstring` on the pipeline's `__call__`.
+# NOTE(review): the snippet demonstrates `diffusers.MarigoldNormalsPipeline` with a
+# Marigold checkpoint, not `YOSODiffusePipeline` - it looks carried over and should
+# probably be updated to show this pipeline.
+EXAMPLE_DOC_STRING = """
+Examples:
+```py
+>>> import diffusers
+>>> import torch
+>>> pipe = diffusers.MarigoldNormalsPipeline.from_pretrained(
+...     "prs-eth/marigold-normals-lcm-v0-1", variant="fp16", torch_dtype=torch.float16
+... ).to("cuda")
+>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
+>>> normals = pipe(image)
+>>> vis = pipe.image_processor.visualize_normals(normals.prediction)
+>>> vis[0].save("einstein_normals.png")
+```
+"""
+@dataclass
+class YOSODiffuseOutput(BaseOutput):
+    """
+    Output container with the three fields declared below.
+    Args:
+        prediction (`np.ndarray`, `torch.Tensor`):
+            Predicted image(s). NOTE(review): the demo app clips the values to [-1, 1] and converts the first
+            element to an image, which suggests a batch of images in that range - confirm the exact layout
+            against the pipeline's `__call__`.
+        latent (`None`, `torch.Tensor`):
+            Latent features corresponding to the predictions. NOTE(review): shape and semantics are set in
+            `__call__`, which is not fully visible here - confirm.
+        gaus_noise (`None`, `torch.Tensor`):
+            NOTE(review): presumably the Gaussian noise the prediction started from; the demo app passes it
+            back through the `latents` argument for the following video frames - confirm.
+    """
+    prediction: Union[np.ndarray, torch.Tensor]
+    latent: Union[None, torch.Tensor]
+    gaus_noise: Union[None, torch.Tensor]
+class YOSODiffusePipeline(StableDiffusionControlNetPipeline):
+    """ Pipeline for monocular normals estimation using the Marigold method: https://marigoldmonodepth.github.io.
+    Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+    The pipeline also inherits the following loading methods:
+        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
+        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
+        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+        text_encoder ([`~transformers.CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        tokenizer ([`~transformers.CLIPTokenizer`]):
+            A `CLIPTokenizer` to tokenize text.
+        unet ([`UNet2DConditionModel`]):
+            A `UNet2DConditionModel` to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`] or `List[ControlNetModel]`):
+            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
+            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
+            additional conditioning.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
+            about a model's potential harms.
+        feature_extractor ([`~transformers.CLIPImageProcessor`]):
+            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
+    """
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel]],
+        scheduler: Union[DDIMScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+        default_denoising_steps: Optional[int] = 1,
+		default_processing_resolution: Optional[int] = 768,
+        prompt="",
+        empty_text_embedding=None,
+        t_start: Optional[int] = 401,
+    ):
+        super().__init__(
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            controlnet,
+            scheduler,
+            safety_checker,
+            feature_extractor,
+            image_encoder,
+            requires_safety_checker,
+                )
+        # TODO yoso ImageProcessor
+        self.image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.control_image_processor = MarigoldImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_denoising_steps = default_denoising_steps
+        self.default_processing_resolution = default_processing_resolution
+        self.prompt = prompt
+        self.prompt_embeds = None
+        self.empty_text_embedding = empty_text_embedding
+        self.t_start= t_start # target_out latents
+    def check_inputs(
+        self,
+        image: PipelineImageInput,
+        num_inference_steps: int,
+        ensemble_size: int,
+        processing_resolution: int,
+        resample_method_input: str,
+        resample_method_output: str,
+        batch_size: int,
+        ensembling_kwargs: Optional[Dict[str, Any]],
+        latents: Optional[torch.Tensor],
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+        output_type: str,
+        output_uncertainty: bool,
+    ) -> int:
+        """Validate the call arguments and count the input images.
+        The checks run in a fixed order (scalar options, then images, then `latents`,
+        then `generator`); the first failing check raises.
+        Returns:
+            The total number of input images (4-D arrays/tensors contribute their first
+            dimension, every other accepted input counts as one image).
+        Raises:
+            ValueError: If any argument is missing, out of range, of an unsupported type,
+                or inconsistent with the other arguments.
+        """
+        # --- scalar option checks ---
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
+        if num_inference_steps < 1:
+            raise ValueError("`num_inference_steps` must be positive.")
+        if ensemble_size < 1:
+            raise ValueError("`ensemble_size` must be positive.")
+        # An ensemble of two is allowed but only triggers a warning.
+        if ensemble_size == 2:
+            logger.warning(
+                "`ensemble_size` == 2 results are similar to no ensembling (1); "
+                "consider increasing the value to at least 3."
+            )
+        if ensemble_size == 1 and output_uncertainty:
+            raise ValueError(
+                "Computing uncertainty by setting `output_uncertainty=True` also requires setting `ensemble_size` "
+                "greater than 1."
+            )
+        if processing_resolution is None:
+            raise ValueError(
+                "`processing_resolution` is not specified and could not be resolved from the model config."
+            )
+        if processing_resolution < 0:
+            raise ValueError(
+                "`processing_resolution` must be non-negative: 0 for native resolution, or any positive value for "
+                "downsampled processing."
+            )
+        if processing_resolution % self.vae_scale_factor != 0:
+            raise ValueError(f"`processing_resolution` must be a multiple of {self.vae_scale_factor}.")
+        if resample_method_input not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_input` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if resample_method_output not in ("nearest", "nearest-exact", "bilinear", "bicubic", "area"):
+            raise ValueError(
+                "`resample_method_output` takes string values compatible with PIL library: "
+                "nearest, nearest-exact, bilinear, bicubic, area."
+            )
+        if batch_size < 1:
+            raise ValueError("`batch_size` must be positive.")
+        if output_type not in ["pt", "np"]:
+            raise ValueError("`output_type` must be one of `pt` or `np`.")
+        if latents is not None and generator is not None:
+            raise ValueError("`latents` and `generator` cannot be used together.")
+        if ensembling_kwargs is not None:
+            if not isinstance(ensembling_kwargs, dict):
+                raise ValueError("`ensembling_kwargs` must be a dictionary.")
+            if "reduction" in ensembling_kwargs and ensembling_kwargs["reduction"] not in ("closest", "mean"):
+                raise ValueError("`ensembling_kwargs['reduction']` can be either `'closest'` or `'mean'`.")
+        # image checks
+        num_images = 0
+        W, H = None, None
+        # Treat a single input as a one-element list.
+        if not isinstance(image, list):
+            image = [image]
+        for i, img in enumerate(image):
+            if isinstance(img, np.ndarray) or torch.is_tensor(img):
+                if img.ndim not in (2, 3, 4):
+                    raise ValueError(f"`image[{i}]` has unsupported dimensions or shape: {img.shape}.")
+                # Height and width are read from the last two dimensions.
+                H_i, W_i = img.shape[-2:]
+                N_i = 1
+                if img.ndim == 4:
+                    N_i = img.shape[0]
+            elif isinstance(img, Image.Image):
+                W_i, H_i = img.size
+                N_i = 1
+            else:
+                raise ValueError(f"Unsupported `image[{i}]` type: {type(img)}.")
+            # All inputs must share the spatial size of the first one.
+            if W is None:
+                W, H = W_i, H_i
+            elif (W, H) != (W_i, H_i):
+                raise ValueError(
+                    f"Input `image[{i}]` has incompatible dimensions {(W_i, H_i)} with the previous images {(W, H)}"
+                )
+            num_images += N_i
+        # latents checks
+        if latents is not None:
+            if not torch.is_tensor(latents):
+                raise ValueError("`latents` must be a torch.Tensor.")
+            if latents.dim() != 4:
+                raise ValueError(f"`latents` has unsupported dimensions or shape: {latents.shape}.")
+            # With a positive processing resolution, scale so the longer side equals it.
+            if processing_resolution > 0:
+                max_orig = max(H, W)
+                new_H = H * processing_resolution // max_orig
+                new_W = W * processing_resolution // max_orig
+                if new_H == 0 or new_W == 0:
+                    raise ValueError(f"Extreme aspect ratio of the input image: [{W} x {H}]")
+                W, H = new_W, new_H
+            # Latent size is the pixel size divided by the VAE scale factor, rounded up.
+            w = (W + self.vae_scale_factor - 1) // self.vae_scale_factor
+            h = (H + self.vae_scale_factor - 1) // self.vae_scale_factor
+            shape_expected = (num_images * ensemble_size, self.vae.config.latent_channels, h, w)
+            if latents.shape != shape_expected:
+                raise ValueError(f"`latents` has unexpected shape={latents.shape} expected={shape_expected}.")
+        # generator checks
+        if generator is not None:
+            if isinstance(generator, list):
+                # A list needs one generator per ensemble member of every image.
+                if len(generator) != num_images * ensemble_size:
+                    raise ValueError(
+                        "The number of generators must match the total number of ensemble members for all input images."
+                    )
+                if not all(g.device.type == generator[0].device.type for g in generator):
+                    raise ValueError("`generator` device placement is not consistent in the list.")
+            elif not isinstance(generator, torch.Generator):
+                raise ValueError(f"Unsupported generator type: {type(generator)}.")
+        return num_images
+    def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        elif not isinstance(self._progress_bar_config, dict):
+            raise ValueError(
+                f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+            )
+        progress_bar_config = dict(**self._progress_bar_config)
+        progress_bar_config["desc"] = progress_bar_config.get("desc", desc)
+        progress_bar_config["leave"] = progress_bar_config.get("leave", leave)
+        if iterable is not None:
+            return tqdm(iterable, **progress_bar_config)
+        elif total is not None:
+            return tqdm(total=total, **progress_bar_config)
+        else:
+            raise ValueError("Either `total` or `iterable` has to be defined.")
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: PipelineImageInput,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_inference_steps: Optional[int] = None,
+        ensemble_size: int = 1,
+        processing_resolution: Optional[int] = None,
+        match_input_resolution: bool = True,
+        resample_method_input: str = "bilinear",
+        resample_method_output: str = "bilinear",
+        batch_size: int = 1,
+        ensembling_kwargs: Optional[Dict[str, Any]] = None,
+        latents: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        output_type: str = "np",
+        output_uncertainty: bool = False,
+        output_latent: bool = False,
+        skip_preprocess: bool = False,
+        return_dict: bool = True,
+        **kwargs,
+    ):
+        """
+        Function invoked when calling the pipeline.
+        The body encodes the input image with the VAE, runs the ControlNet on the image latent at `self.t_start`,
+        performs a single UNet forward pass on the noise latent conditioned on the ControlNet residuals, decodes the
+        UNet output with the VAE, and resizes the result back to the original resolution. Text embeddings are
+        computed on the first call and cached on the pipeline instance (`self.empty_text_embedding`,
+        `self.prompt_embeds`, `self.negative_prompt_embeds`); later calls reuse the cached values.
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`),
+                `List[torch.Tensor]`: An input image or images. For arrays and tensors, the expected value range is
+                between `[0, 1]`. Passing a batch of images is possible by providing a four-dimensional array or a
+                tensor. Additionally, a list of images of two- or three-dimensional arrays or tensors can be passed.
+                In the latter case, all list elements must have the same width and height.
+            prompt (`str` or `List[str]`, *optional*):
+                Not used to build the text conditioning in this method: the prompt embeddings are encoded from
+                `self.prompt`, and the local name is rebound to `""` when the empty text embedding is computed.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                Forwarded to `self.encode_prompt`, but only when `self.prompt_embeds` has not been cached yet.
+            num_inference_steps (`int`, *optional*, defaults to `None`):
+                The default value `None` resolves to `self.default_denoising_steps`. In this method the value is only
+                forwarded to `check_inputs`; the sampling below consists of a single UNet call.
+            ensemble_size (`int`, defaults to `1`):
+                Number of latent replicas created per input image in `prepare_latents`. This method does not
+                aggregate the replicas.
+            processing_resolution (`int`, *optional*, defaults to `None`):
+                Effective processing resolution. When set to `0`, matches the larger input image dimension. The
+                default value `None` resolves to `self.default_processing_resolution`.
+            match_input_resolution (`bool`, *optional*, defaults to `True`):
+                Not referenced in this method; the prediction is always resized to the original resolution.
+            resample_method_input (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize input images to `processing_resolution`. The accepted values are:
+                `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            resample_method_output (`str`, *optional*, defaults to `"bilinear"`):
+                Resampling method used to resize output predictions to match the input resolution. The accepted values
+                are `"nearest"`, `"nearest-exact"`, `"bilinear"`, `"bicubic"`, or `"area"`.
+            batch_size (`int`, *optional*, defaults to `1`):
+                Number of images passed to the VAE encoder at once in `prepare_latents`.
+            ensembling_kwargs (`dict`, *optional*, defaults to `None`)
+                Only forwarded to `check_inputs` in this method.
+            latents (`torch.Tensor`, *optional*, defaults to `None`):
+                Latent noise tensors to replace the random initialization of the prediction latent. A copy of the
+                noise that was actually used is returned in the output's `gaus_noise` field.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Forwarded to `self.encode_prompt`, but only when `self.prompt_embeds` has not been cached yet.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Not used as an input: `None` is always passed to `self.encode_prompt` in its place.
+            num_images_per_prompt (`int`, *optional*, defaults to `1`):
+                Forwarded to `self.encode_prompt`.
+            generator (`torch.Generator`, or `List[torch.Generator]`, *optional*, defaults to `None`):
+                Random number generator object to ensure reproducibility.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to `1.0`):
+                Passed to the ControlNet as `conditioning_scale`.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                Preferred format of the output's `prediction` field. The accepted values are: `"np"` (numpy array) or
+                `"pt"` (torch tensor).
+            output_uncertainty (`bool`, *optional*, defaults to `False`):
+                Only forwarded to `check_inputs` in this method; no uncertainty map is computed here.
+            output_latent (`bool`, *optional*, defaults to `False`):
+                Not referenced in this method; the latent is always included in the output.
+            skip_preprocess (`bool`, *optional*, defaults to `False`):
+                When enabled, `image` is used as-is, the padding is taken to be `(0, 0)`, and the original resolution
+                is read from `image.shape[2:]` (presumably an already preprocessed `[N,3,H,W]` tensor -- TODO
+                confirm against callers).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Not referenced in this method; a `YOSODiffuseOutput` is always returned.
+        Examples:
+        Returns:
+            `YOSODiffuseOutput`:
+                An output object whose `prediction` field holds the decoded and resized prediction, whose `latent`
+                field holds the UNet output latent, and whose `gaus_noise` field holds a copy of the initial noise
+                latent.
+        """
+        # 0. Resolving variables.
+        device = self._execution_device
+        dtype = self.dtype
+        # Model-specific optimal default values leading to fast and reasonable results.
+        if num_inference_steps is None:
+            num_inference_steps = self.default_denoising_steps
+        if processing_resolution is None:
+            processing_resolution = self.default_processing_resolution
+        # 1. Check inputs.
+        num_images = self.check_inputs(
+            image,
+            num_inference_steps,
+            ensemble_size,
+            processing_resolution,
+            resample_method_input,
+            resample_method_output,
+            batch_size,
+            ensembling_kwargs,
+            latents,
+            generator,
+            output_type,
+            output_uncertainty,
+        )
+        # 2. Prepare empty text conditioning.
+        # Model invocation: self.tokenizer, self.text_encoder.
+        # The embedding is computed once and cached on the instance; it is not read again within this method.
+        if self.empty_text_embedding is None:
+            prompt = ""
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="do_not_pad",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids.to(device)
+            self.empty_text_embedding = self.text_encoder(text_input_ids)[0]  # [1,2,1024]
+        # 3. Prepare the prompt embeddings. They are encoded from `self.prompt` on the first call and cached, so the
+        # `negative_prompt` and `prompt_embeds` arguments have no effect once the cache is populated.
+        if self.prompt_embeds is None:
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                self.prompt,
+                device,
+                num_images_per_prompt,
+                False,  # presumably `do_classifier_free_guidance` -- TODO confirm against `encode_prompt`
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=None,
+                lora_scale=None,
+                clip_skip=None,
+            )
+            self.prompt_embeds = prompt_embeds
+            self.negative_prompt_embeds = negative_prompt_embeds
+        # 4. Preprocess input images. This function loads input image or images of compatible dimensions `(H, W)`,
+        # optionally downsamples them to the `processing_resolution` `(PH, PW)`, where
+        # `max(PH, PW) == processing_resolution`, and pads the dimensions to `(PPH, PPW)` such that these values are
+        # divisible by the latent space downscaling factor (typically 8 in Stable Diffusion). The default value `None`
+        # of `processing_resolution` resolves to the optimal value from the model config. It is a recommended mode of
+        # operation and leads to the most reasonable results. Using the native image resolution or any other processing
+        # resolution can lead to loss of either fine details or global context in the output predictions.
+        if not skip_preprocess:
+            image, padding, original_resolution = self.image_processor.preprocess(
+                image, processing_resolution, resample_method_input, device, dtype
+            )  # [N,3,PPH,PPW]
+        else:
+            # The caller supplies an already prepared tensor: nothing to unpad, and the output keeps its size.
+            padding = (0, 0)
+            original_resolution = image.shape[2:]
+        # 5. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
+        # replicas. The variable `image_latent` contains each input image encoded into latent space and replicated
+        # `E` times; `pred_latent` has the same shape and holds the initial noise. The noise can be either generated
+        # (see `generator` to ensure reproducibility), or passed explicitly via the `latents` argument. The latent
+        # space dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
+        # Model invocation: self.vae.encoder.
+        image_latent, pred_latent = self.prepare_latents(
+            image, latents, generator, ensemble_size, batch_size
+        )  # [N*E,4,h,w], [N*E,4,h,w]
+        # Keep a detached copy of the initial noise so that it can be handed back to the caller.
+        gaus_noise = pred_latent.detach().clone()
+        del image
+        # 6. Obtain the ControlNet residuals from the image latent at the starting timestep.
+        cond_scale =controlnet_conditioning_scale
+        down_block_res_samples, mid_block_res_sample = self.controlnet(
+            image_latent.detach(),
+            self.t_start,
+            encoder_hidden_states=self.prompt_embeds,
+            conditioning_scale=cond_scale,
+            guess_mode=False,
+            return_dict=False,
+        )
+        # 7. YOSO sampling: a single UNet forward pass whose output is decoded directly as the predicted latent.
+        latent_x_t = self.unet(
+            pred_latent,
+            self.t_start,
+            encoder_hidden_states=self.prompt_embeds,
+            down_block_additional_residuals=down_block_res_samples,
+            mid_block_additional_residual=mid_block_res_sample,
+            return_dict=False,
+        )[0]
+        del (
+            pred_latent,
+            image_latent,
+        )
+        # 8. Decode the predicted latent, remove the padding, and resize to the original resolution.
+        prediction = self.decode_prediction(latent_x_t)
+        prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,3,PH,PW]
+        prediction = self.image_processor.resize_antialias(
+            prediction, original_resolution, resample_method_output, is_aa=False
+        )  # [N,3,H,W]
+        if output_type == "np":
+            prediction = self.image_processor.pt_to_numpy(prediction)  # [N,H,W,3]
+        # 9. Offload all models
+        self.maybe_free_model_hooks()
+        return YOSODiffuseOutput(
+            prediction=prediction,
+            latent=latent_x_t,
+            gaus_noise=gaus_noise,
+        )
+    # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        image: torch.Tensor,
+        latents: Optional[torch.Tensor],
+        generator: Optional[torch.Generator],
+        ensemble_size: int,
+        batch_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        def retrieve_latents(encoder_output):
+            if hasattr(encoder_output, "latent_dist"):
+                return encoder_output.latent_dist.mode()
+            elif hasattr(encoder_output, "latents"):
+                return encoder_output.latents
+            else:
+                raise AttributeError("Could not access latents of provided encoder_output")
+        image_latent = torch.cat(
+            [
+                retrieve_latents(self.vae.encode(image[i : i + batch_size]))
+                for i in range(0, image.shape[0], batch_size)
+            ],
+            dim=0,
+        )  # [N,4,h,w]
+        image_latent = image_latent * self.vae.config.scaling_factor
+        image_latent = image_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
+        pred_latent = latents
+        if pred_latent is None:
+            pred_latent = randn_tensor(
+                image_latent.shape,
+                generator=generator,
+                device=image_latent.device,
+                dtype=image_latent.dtype,
+            )  # [N*E,4,h,w]
+        return image_latent, pred_latent
+    def decode_prediction(self, pred_latent: torch.Tensor) -> torch.Tensor:
+        if pred_latent.dim() != 4 or pred_latent.shape[1] != self.vae.config.latent_channels:
+            raise ValueError(
+                f"Expecting 4D tensor of shape [B,{self.vae.config.latent_channels},H,W]; got {pred_latent.shape}."
+            )
+        prediction = self.vae.decode(pred_latent / self.vae.config.scaling_factor, return_dict=False)[0]  # [B,3,H,W]
+        return prediction  # [B,3,H,W]
+    @staticmethod
+    def normalize_normals(normals: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+        if normals.dim() != 4 or normals.shape[1] != 3:
+            raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
+        norm = torch.norm(normals, dim=1, keepdim=True)
+        normals /= norm.clamp(min=eps)
+        return normals
+    @staticmethod
+    def ensemble_normals(
+        normals: torch.Tensor, output_uncertainty: bool, reduction: str = "closest"
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Ensembles the normals maps represented by the `normals` tensor with expected shape `(B, 3, H, W)`, where B is
+        the number of ensemble members for a given prediction of size `(H x W)`.
+        Args:
+            normals (`torch.Tensor`):
+                Input ensemble normals maps.
+            output_uncertainty (`bool`, *optional*, defaults to `False`):
+                Whether to output uncertainty map.
+            reduction (`str`, *optional*, defaults to `"closest"`):
+                Reduction method used to ensemble aligned predictions. The accepted values are: `"closest"` and
+                `"mean"`.
+        Returns:
+            A tensor of aligned and ensembled normals maps with shape `(1, 3, H, W)` and optionally a tensor of
+            uncertainties of shape `(1, 1, H, W)`.
+        """
+        if normals.dim() != 4 or normals.shape[1] != 3:
+            raise ValueError(f"Expecting 4D tensor of shape [B,3,H,W]; got {normals.shape}.")
+        if reduction not in ("closest", "mean"):
+            raise ValueError(f"Unrecognized reduction method: {reduction}.")
+        mean_normals = normals.mean(dim=0, keepdim=True)  # [1,3,H,W]
+        mean_normals = MarigoldNormalsPipeline.normalize_normals(mean_normals)  # [1,3,H,W]
+        sim_cos = (mean_normals * normals).sum(dim=1, keepdim=True)  # [E,1,H,W]
+        sim_cos = sim_cos.clamp(-1, 1)  # required to avoid NaN in uncertainty with fp16
+        uncertainty = None
+        if output_uncertainty:
+            uncertainty = sim_cos.arccos()  # [E,1,H,W]
+            uncertainty = uncertainty.mean(dim=0, keepdim=True) / np.pi  # [1,1,H,W]
+        if reduction == "mean":
+            return mean_normals, uncertainty  # [1,3,H,W], [1,1,H,W]
+        closest_indices = sim_cos.argmax(dim=0, keepdim=True)  # [1,1,H,W]
+        closest_indices = closest_indices.repeat(1, 3, 1, 1)  # [1,3,H,W]
+        closest_normals = torch.gather(normals, 0, closest_indices)  # [1,3,H,W]
+        return closest_normals, uncertainty  # [1,3,H,W], [1,1,H,W]
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps