Spaces:

Stable-X
/

StableNormal

Running on Zero

App Files Files Community

Stable-X committed on Jun 12

Commit

9a4072e

•

1 Parent(s): 19b2060

Fix scheduler and preprocessor bug

Browse files

Files changed (4) hide show

app.py +12 -94
stablenormal/pipeline_stablenormal.py +129 -50
stablenormal/pipeline_yoso_normal.py +16 -6
stablenormal/scheduler/heuristics_ddimsampler.py +12 -5

app.py CHANGED Viewed

@@ -28,6 +28,7 @@ import imageio as imageio
 import numpy as np
 import spaces
 import torch as torch
 from PIL import Image
 from gradio_imageslider import ImageSlider
 from tqdm import tqdm
@@ -55,7 +56,7 @@ default_image_processing_resolution = 768
 default_video_num_inference_steps = 10
 default_video_processing_resolution = 768
-default_video_out_max_frames = 450
 def process_image_check(path_input):
     if path_input is None:
@@ -99,7 +100,6 @@ def process_image(
     path_output_dir = tempfile.mkdtemp()
     path_out_png = os.path.join(path_output_dir, f"{name_base}_normal_colored.png")
-    yield None
     input_image = Image.open(path_input)
     input_image = resize_image(input_image, default_image_processing_resolution)
@@ -132,7 +132,7 @@ def process_video(
     pipe,
     path_input,
     out_max_frames=default_video_out_max_frames,
-    target_fps=3,
     progress=gr.Progress(),
 ):
     if path_input is None:
@@ -146,6 +146,7 @@ def process_video(
     path_output_dir = tempfile.mkdtemp()
     path_out_vis = os.path.join(path_output_dir, f"{name_base}_normal_colored.mp4")
     reader, writer = None, None
     try:
         reader = imageio.get_reader(path_input)
@@ -174,8 +175,11 @@ def process_video(
             pipe_out = pipe(
                 frame_pil,
                 match_input_resolution=False,
             )
             processed_frame = pipe.image_processor.visualize_normals(  # noqa
                 pipe_out.prediction
             )[0]
@@ -333,7 +337,7 @@ def run_demo_server(pipe):
                     inputs=[video_input],
                     outputs=[processed_frames, video_output_files],
                     directory_name="examples_video",
-                    cache_examples=True,
                 )
             with gr.Tab("Panorama"):
@@ -407,108 +411,22 @@ def run_demo_server(pipe):
             server_port=7860,
         )
-from einops import rearrange
-class DINOv2_Encoder:
-    IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
-    IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]
-    def __init__(
-        self,
-        model_name = 'dinov2_vitl14',
-        freeze = True,
-        antialias=True,
-        device="cuda",
-        size = 448,
-    ):
-        super(DINOv2_Encoder).__init__()
-        self.model = torch.hub.load('facebookresearch/dinov2', model_name)
-        self.model.eval()
-        self.device = device
-        self.antialias = antialias
-        self.dtype = torch.float32
-        self.mean = torch.Tensor(self.IMAGENET_DEFAULT_MEAN)
-        self.std = torch.Tensor(self.IMAGENET_DEFAULT_STD)
-        self.size = size
-        if freeze:
-            self.freeze()
-    def freeze(self):
-        for param in self.model.parameters():
-            param.requires_grad = False
-    @torch.no_grad()
-    def encoder(self, x):
-        '''
-        x: [b h w c], range from (-1, 1), rbg
-        '''
-        x = self.preprocess(x).to(self.device, self.dtype)
-        b, c, h, w = x.shape
-        patch_h, patch_w = h // 14, w // 14
-        embeddings = self.model.forward_features(x)['x_norm_patchtokens']
-        embeddings = rearrange(embeddings, 'b (h w) c -> b h w c', h = patch_h, w = patch_w)
-        return  rearrange(embeddings, 'b h w c -> b c h w')
-    def preprocess(self, x):
-        ''' x
-        '''
-        # normalize to [0,1],
-        x = torch.nn.functional.interpolate(
-            x,
-            size=(self.size, self.size),
-            mode='bicubic',
-            align_corners=True,
-            antialias=self.antialias,
-        )
-        x = (x + 1.0) / 2.0
-        # renormalize according to dino
-        mean = self.mean.view(1, 3, 1, 1).to(x.device)
-        std = self.std.view(1, 3, 1, 1).to(x.device)
-        x = (x - mean) / std
-        return x
-    def to(self, device, dtype=None):
-        if dtype is not None:
-            self.dtype = dtype
-            self.model.to(device, dtype)
-            self.mean.to(device, dtype)
-            self.std.to(device, dtype)
-        else:
-            self.model.to(device)
-            self.mean.to(device)
-            self.std.to(device)
-        return self
-    def __call__(self, x, **kwargs):
-        return self.encoder(x, **kwargs)
 def main():
     os.system("pip freeze")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     x_start_pipeline = YOSONormalsPipeline.from_pretrained(
-        'Stable-X/yoso-normal-v0-1', trust_remote_code=True,
-        t_start=300).to(device)
-    dinov2_prior = DINOv2_Encoder(size=672)
-    dinov2_prior.to(device)
-    pipe = StableNormalPipeline.from_pretrained('Stable-X/stable-normal-v0-1', t_start=300, trust_remote_code=True,
                                                 scheduler=HEURI_DDIMScheduler(prediction_type='sample',
                                                                               beta_start=0.00085, beta_end=0.0120,
                                                                               beta_schedule = "scaled_linear"))
     pipe.x_start_pipeline = x_start_pipeline
-    pipe.prior = dinov2_prior
     pipe.to(device)
     try:
         import xformers

 import numpy as np
 import spaces
 import torch as torch
+torch.backends.cuda.matmul.allow_tf32 = True
 from PIL import Image
 from gradio_imageslider import ImageSlider
 from tqdm import tqdm
 default_video_num_inference_steps = 10
 default_video_processing_resolution = 768
+default_video_out_max_frames = 60
 def process_image_check(path_input):
     if path_input is None:
     path_output_dir = tempfile.mkdtemp()
     path_out_png = os.path.join(path_output_dir, f"{name_base}_normal_colored.png")
     input_image = Image.open(path_input)
     input_image = resize_image(input_image, default_image_processing_resolution)
     pipe,
     path_input,
     out_max_frames=default_video_out_max_frames,
+    target_fps=10,
     progress=gr.Progress(),
 ):
     if path_input is None:
     path_output_dir = tempfile.mkdtemp()
     path_out_vis = os.path.join(path_output_dir, f"{name_base}_normal_colored.mp4")
+    init_latents = None
     reader, writer = None, None
     try:
         reader = imageio.get_reader(path_input)
             pipe_out = pipe(
                 frame_pil,
                 match_input_resolution=False,
+                latents=init_latents
             )
+            if init_latents is None:
+                init_latents = pipe_out.gaus_noise
             processed_frame = pipe.image_processor.visualize_normals(  # noqa
                 pipe_out.prediction
             )[0]
                     inputs=[video_input],
                     outputs=[processed_frames, video_output_files],
                     directory_name="examples_video",
+                    cache_examples=False,
                 )
             with gr.Tab("Panorama"):
             server_port=7860,
         )
 def main():
     os.system("pip freeze")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     x_start_pipeline = YOSONormalsPipeline.from_pretrained(
+        'weights/yoso-normal-v0-2', trust_remote_code=True, variant="fp16", torch_dtype=torch.float16).to(device)
+    pipe = StableNormalPipeline.from_pretrained('weights/stable-normal-v0-1', trust_remote_code=True,
+                                                variant="fp16", torch_dtype=torch.float16,
                                                 scheduler=HEURI_DDIMScheduler(prediction_type='sample',
                                                                               beta_start=0.00085, beta_end=0.0120,
                                                                               beta_schedule = "scaled_linear"))
     pipe.x_start_pipeline = x_start_pipeline
     pipe.to(device)
+    pipe.prior.to(device, torch.float16)
     try:
         import xformers

stablenormal/pipeline_stablenormal.py CHANGED Viewed

@@ -99,7 +99,90 @@ class StableNormalOutput(BaseOutput):
     prediction: Union[np.ndarray, torch.Tensor]
     latent: Union[None, torch.Tensor]
 class StableNormalPipeline(StableDiffusionControlNetPipeline):
     """ Pipeline for monocular normals estimation using the Marigold method: https://marigoldmonodepth.github.io.
@@ -163,7 +246,6 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
         default_processing_resolution: Optional[int] = 768,
         prompt="The normal map",
         empty_text_embedding=None,
-        t_start: Optional[int] = 401,
     ):
         super().__init__(
             vae,
@@ -190,8 +272,7 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
         self.prompt = prompt
         self.prompt_embeds = None
         self.empty_text_embedding = empty_text_embedding
-        self.t_start= torch.tensor(t_start) # target_out latents
     def check_inputs(
         self,
@@ -346,7 +427,6 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
         num_inference_steps: Optional[int] = None,
         ensemble_size: int = 1,
         processing_resolution: Optional[int] = None,
-        return_intermediate_result: bool = False,
         match_input_resolution: bool = True,
         resample_method_input: str = "bilinear",
         resample_method_output: str = "bilinear",
@@ -441,10 +521,14 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
             image, processing_resolution, resample_method_input, device, dtype
         )  # [N,3,PPH,PPW]
         # 0. X_start latent obtain
-        predictor = self.x_start_pipeline(image, skip_preprocess=True)
         x_start_latent = predictor.latent
-        gauss_latent = predictor.gauss_latent
         # 1. Check inputs.
         num_images = self.check_inputs(
@@ -503,28 +587,14 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
         dino_features = self.dino_controlnet.dino_controlnet_cond_embedding(dino_features)
         dino_features = self.match_noisy(dino_features, x_start_latent)
-        # 6. Encode input image into latent space. At this step, each of the `N` input images is represented with `E`
-        # ensemble members. Each ensemble member is an independent diffused prediction, just initialized independently.
-        # Latents of each such predictions across all input images and all ensemble members are represented in the
-        # `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
-        # into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
-        # reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
-        # code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
-        # as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
-        # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
-        # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
-        # Model invocation: self.vae.encoder.
-        image_latent, pred_latent = self.prepare_latents(
-            image, latents, generator, ensemble_size, batch_size
-        )  # [N*E,4,h,w], [N*E,4,h,w]
         del (
                 image,
         )
         # 7. denoise sampling, using heuritic sampling proposed by Ye.
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
         cond_scale =controlnet_conditioning_scale
         pred_latent = x_start_latent
@@ -544,50 +614,58 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
         pred_latents = []
-        down_block_res_samples, mid_block_res_sample = self.controlnet(
-            image_latent.detach(),
-            self.t_start,
-            encoder_hidden_states=self.prompt_embeds,
-            conditioning_scale=cond_scale,
-            guess_mode=False,
-            return_dict=False,
-        )
         last_pred_latent = pred_latent
-        for i in range(4):
             _dino_down_block_res_samples = [dino_down_block_res_sample for dino_down_block_res_sample in dino_down_block_res_samples]  # copy, avoid repeat quiery
-            model_output = self.dino_unet_forward(
                 self.unet,
                 pred_latent,
-                self.t_start,
                 encoder_hidden_states=self.prompt_embeds,
                 down_block_additional_residuals=down_block_res_samples,
                 mid_block_additional_residual=mid_block_res_sample,
                 dino_down_block_additional_residuals= _dino_down_block_res_samples,
                 return_dict=False,
             )[0]  # [B,4,h,w]
-            pred_latents.append(model_output)
-            pred_latent = self.scheduler.add_noise(model_output, gauss_latent, self.t_start)
-            pred_latent = 0.4 * pred_latent + 0.6 * last_pred_latent
-            last_pred_latent = pred_latent
-        pred_latents = torch.cat(pred_latents, dim=0)
         del (
             image_latent,
             dino_features,
         )
         # decoder
-        if return_intermediate_result:
-            prediction = []
-            for _pred_latent in pred_latents:
-                _prediction = self.decode_prediction(_pred_latent.unsqueeze(dim=0))
-                prediction.append(_prediction)
-            prediction = torch.cat(prediction, dim=0)
-        else:
-            prediction = self.decode_prediction(pred_latents[-1].unsqueeze(dim=0))
         prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,3,PH,PW]
         if match_input_resolution:
             prediction = self.image_processor.resize_antialias(
                 prediction, original_resolution, resample_method_output, is_aa=False
@@ -604,6 +682,7 @@ class StableNormalPipeline(StableDiffusionControlNetPipeline):
         return StableNormalOutput(
             prediction=prediction,
             latent=pred_latent,
         )
     # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents

     prediction: Union[np.ndarray, torch.Tensor]
     latent: Union[None, torch.Tensor]
+    gaus_noise: Union[None, torch.Tensor]
+from einops import rearrange
+class DINOv2_Encoder(torch.nn.Module):
+    IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]
+    def __init__(
+        self,
+        model_name = 'dinov2_vitl14',
+        freeze = True,
+        antialias=True,
+        device="cuda",
+        size = 448,
+    ):
+        super(DINOv2_Encoder, self).__init__()
+        self.model = torch.hub.load('facebookresearch/dinov2', model_name)
+        self.model.eval().to(device)
+        self.device = device
+        self.antialias = antialias
+        self.dtype = torch.float32
+        self.mean = torch.Tensor(self.IMAGENET_DEFAULT_MEAN)
+        self.std = torch.Tensor(self.IMAGENET_DEFAULT_STD)
+        self.size = size
+        if freeze:
+            self.freeze()
+    def freeze(self):
+        for param in self.model.parameters():
+            param.requires_grad = False
+    @torch.no_grad()
+    def encoder(self, x):
+        '''
+        x: [b h w c], range from (-1, 1), rbg
+        '''
+        x = self.preprocess(x).to(self.device, self.dtype)
+        b, c, h, w = x.shape
+        patch_h, patch_w = h // 14, w // 14
+        embeddings = self.model.forward_features(x)['x_norm_patchtokens']
+        embeddings = rearrange(embeddings, 'b (h w) c -> b h w c', h = patch_h, w = patch_w)
+        return  rearrange(embeddings, 'b h w c -> b c h w')
+    def preprocess(self, x):
+        ''' x
+        '''
+        # normalize to [0,1],
+        x = torch.nn.functional.interpolate(
+            x,
+            size=(self.size, self.size),
+            mode='bicubic',
+            align_corners=True,
+            antialias=self.antialias,
+        )
+        x = (x + 1.0) / 2.0
+        # renormalize according to dino
+        mean = self.mean.view(1, 3, 1, 1).to(x.device)
+        std = self.std.view(1, 3, 1, 1).to(x.device)
+        x = (x - mean) / std
+        return x
+    def to(self, device, dtype=None):
+        if dtype is not None:
+            self.dtype = dtype
+            self.model.to(device, dtype)
+            self.mean.to(device, dtype)
+            self.std.to(device, dtype)
+        else:
+            self.model.to(device)
+            self.mean.to(device)
+            self.std.to(device)
+        return self
+    def __call__(self, x, **kwargs):
+        return self.encoder(x, **kwargs)
 class StableNormalPipeline(StableDiffusionControlNetPipeline):
     """ Pipeline for monocular normals estimation using the Marigold method: https://marigoldmonodepth.github.io.
         default_processing_resolution: Optional[int] = 768,
         prompt="The normal map",
         empty_text_embedding=None,
     ):
         super().__init__(
             vae,
         self.prompt = prompt
         self.prompt_embeds = None
         self.empty_text_embedding = empty_text_embedding
+        self.prior = DINOv2_Encoder(size=672)
     def check_inputs(
         self,
         num_inference_steps: Optional[int] = None,
         ensemble_size: int = 1,
         processing_resolution: Optional[int] = None,
         match_input_resolution: bool = True,
         resample_method_input: str = "bilinear",
         resample_method_output: str = "bilinear",
             image, processing_resolution, resample_method_input, device, dtype
         )  # [N,3,PPH,PPW]
+        image_latent, gaus_noise = self.prepare_latents(
+            image, latents, generator, ensemble_size, batch_size
+        )  # [N,4,h,w], [N,4,h,w]
         # 0. X_start latent obtain
+        predictor = self.x_start_pipeline(image, latents=gaus_noise,
+                                          processing_resolution=processing_resolution, skip_preprocess=True)
         x_start_latent = predictor.latent
         # 1. Check inputs.
         num_images = self.check_inputs(
         dino_features = self.dino_controlnet.dino_controlnet_cond_embedding(dino_features)
         dino_features = self.match_noisy(dino_features, x_start_latent)
         del (
                 image,
         )
         # 7. denoise sampling, using heuritic sampling proposed by Ye.
+        t_start = self.x_start_pipeline.t_start
+        self.scheduler.set_timesteps(num_inference_steps, t_start=t_start,device=device)
         cond_scale =controlnet_conditioning_scale
         pred_latent = x_start_latent
         pred_latents = []
         last_pred_latent = pred_latent
+        for (t, prev_t) in self.progress_bar(zip(self.scheduler.timesteps,self.scheduler.prev_timesteps), leave=False, desc="Diffusion steps..."):
             _dino_down_block_res_samples = [dino_down_block_res_sample for dino_down_block_res_sample in dino_down_block_res_samples]  # copy, avoid repeat quiery
+            # controlnet
+            down_block_res_samples, mid_block_res_sample = self.controlnet(
+                image_latent.detach(),
+                t,
+                encoder_hidden_states=self.prompt_embeds,
+                conditioning_scale=cond_scale,
+                guess_mode=False,
+                return_dict=False,
+            )
+            # SG-DRN
+            noise = self.dino_unet_forward(
                 self.unet,
                 pred_latent,
+                t,
                 encoder_hidden_states=self.prompt_embeds,
                 down_block_additional_residuals=down_block_res_samples,
                 mid_block_additional_residual=mid_block_res_sample,
                 dino_down_block_additional_residuals= _dino_down_block_res_samples,
                 return_dict=False,
             )[0]  # [B,4,h,w]
+            pred_latents.append(noise)
+            # ddim steps
+            out = self.scheduler.step(
+                noise, t, prev_t, pred_latent, gaus_noise = gaus_noise, generator=generator, cur_step=cur_step+1  # NOTE that cur_step dirs to next_step
+            )# [B,4,h,w]
+            pred_latent = out.prev_sample
+            cur_step += 1
         del (
             image_latent,
             dino_features,
         )
+        pred_latent = pred_latents[-1]  # using x0
         # decoder
+        prediction = self.decode_prediction(pred_latent)
         prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,3,PH,PW]
+        prediction = self.image_processor.resize_antialias(prediction, original_resolution, resample_method_output, is_aa=False)  # [N,3,H,W]
+        if match_input_resolution:
+            prediction = self.image_processor.resize_antialias(
+                prediction, original_resolution, resample_method_output, is_aa=False
+            )  # [N,3,H,W]
         if match_input_resolution:
             prediction = self.image_processor.resize_antialias(
                 prediction, original_resolution, resample_method_output, is_aa=False
         return StableNormalOutput(
             prediction=prediction,
             latent=pred_latent,
+            gaus_noise=gaus_noise
         )
     # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents

stablenormal/pipeline_yoso_normal.py CHANGED Viewed

@@ -93,7 +93,7 @@ class YosoNormalsOutput(BaseOutput):
     prediction: Union[np.ndarray, torch.Tensor]
     latent: Union[None, torch.Tensor]
-    gauss_latent: Union[None, torch.Tensor]
 class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
@@ -502,10 +502,11 @@ class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
         # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
         # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
         # Model invocation: self.vae.encoder.
-        image_latent, gauss_latent = self.prepare_latents(
             image, latents, generator, ensemble_size, batch_size
         )  # [N*E,4,h,w], [N*E,4,h,w]
         del image
@@ -523,7 +524,7 @@ class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
         # 7. YOSO sampling
         latent_x_t = self.unet(
-            gauss_latent,
             self.t_start,
             encoder_hidden_states=self.prompt_embeds,
             down_block_additional_residuals=down_block_res_samples,
@@ -533,6 +534,7 @@ class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
         del (
             image_latent,
         )
@@ -554,7 +556,7 @@ class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
         return YosoNormalsOutput(
             prediction=prediction,
             latent=latent_x_t,
-            gauss_latent=gauss_latent,
         )
     # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
@@ -585,7 +587,15 @@ class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
         )  # [N,4,h,w]
         image_latent = image_latent * self.vae.config.scaling_factor
         image_latent = image_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
-        pred_latent = torch.randn_like(image_latent)
         return image_latent, pred_latent
@@ -714,4 +724,4 @@ def retrieve_timesteps(
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps

     prediction: Union[np.ndarray, torch.Tensor]
     latent: Union[None, torch.Tensor]
+    gaus_noise: Union[None, torch.Tensor]
 class YOSONormalsPipeline(StableDiffusionControlNetPipeline):
         # noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
         # dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
         # Model invocation: self.vae.encoder.
+        image_latent, pred_latent = self.prepare_latents(
             image, latents, generator, ensemble_size, batch_size
         )  # [N*E,4,h,w], [N*E,4,h,w]
+        gaus_noise = pred_latent.detach().clone()
         del image
         # 7. YOSO sampling
         latent_x_t = self.unet(
+            pred_latent,
             self.t_start,
             encoder_hidden_states=self.prompt_embeds,
             down_block_additional_residuals=down_block_res_samples,
         del (
+            pred_latent,
             image_latent,
         )
         return YosoNormalsOutput(
             prediction=prediction,
             latent=latent_x_t,
+            gaus_noise=gaus_noise,
         )
     # Copied from diffusers.pipelines.marigold.pipeline_marigold_depth.MarigoldDepthPipeline.prepare_latents
         )  # [N,4,h,w]
         image_latent = image_latent * self.vae.config.scaling_factor
         image_latent = image_latent.repeat_interleave(ensemble_size, dim=0)  # [N*E,4,h,w]
+        pred_latent = latents
+        if pred_latent is None:
+            pred_latent = randn_tensor(
+                image_latent.shape,
+                generator=generator,
+                device=image_latent.device,
+                dtype=image_latent.dtype,
+            )  # [N*E,4,h,w]
         return image_latent, pred_latent
     else:
         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
         timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps

stablenormal/scheduler/heuristics_ddimsampler.py CHANGED Viewed

@@ -12,7 +12,7 @@ import pdb
 class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
             """
             Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -56,8 +56,13 @@ class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
                 )
             timesteps = torch.from_numpy(timesteps).to(device)
             naive_sampling_step = num_inference_steps //2
             self.naive_sampling_step = naive_sampling_step
             timesteps[:naive_sampling_step] = timesteps[naive_sampling_step] # refine on step 5 for 5 steps, then backward from step 6
@@ -79,8 +84,8 @@ class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
             use_clipped_model_output: bool = False,
             generator=None,
             cur_step=None,
-            gauss_latent=None,
             variance_noise: Optional[torch.Tensor] = None,
             return_dict: bool = True,
         ) -> Union[DDIMSchedulerOutput, Tuple]:
             """
@@ -134,10 +139,12 @@ class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
             # - pred_prev_sample -> "x_t-1"
             # 1. get previous step value (=t-1)
             # trick from heuri_sampling
             if cur_step == self.naive_sampling_step  and timestep == prev_timestep:
                 timestep += self.gap
             prev_timestep = prev_timestep  # NOTE naive sampling
             # 2. compute alphas, betas
@@ -172,6 +179,7 @@ class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
             variance = self._get_variance(timestep, prev_timestep)
             std_dev_t = eta * variance ** (0.5)
             if use_clipped_model_output:
                 # the pred_epsilon is always re-derived from the clipped x_0 in Glide
                 pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
@@ -180,8 +188,6 @@ class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
             pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
             # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-            if gauss_latent == None:
-                gauss_latent = torch.randn_like(pred_original_sample)
             prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
             if eta > 0:
@@ -200,11 +206,12 @@ class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
                 prev_sample = prev_sample + variance
             if cur_step < self.naive_sampling_step:
-                prev_sample = self.add_noise(pred_original_sample, gauss_latent, timestep)
             if not return_dict:
                 return (prev_sample,)
             return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)

 class HEURI_DDIMScheduler(DDIMScheduler, SchedulerMixin, ConfigMixin):
+    def set_timesteps(self, num_inference_steps: int, t_start: int, device: Union[str, torch.device] = None):
             """
             Sets the discrete timesteps used for the diffusion chain (to be run before inference).
                 )
             timesteps = torch.from_numpy(timesteps).to(device)
             naive_sampling_step = num_inference_steps //2
+            # TODO for debug
+            # naive_sampling_step = 0
             self.naive_sampling_step = naive_sampling_step
             timesteps[:naive_sampling_step] = timesteps[naive_sampling_step] # refine on step 5 for 5 steps, then backward from step 6
             use_clipped_model_output: bool = False,
             generator=None,
             cur_step=None,
             variance_noise: Optional[torch.Tensor] = None,
+            gaus_noise: Optional[torch.Tensor] = None,
             return_dict: bool = True,
         ) -> Union[DDIMSchedulerOutput, Tuple]:
             """
             # - pred_prev_sample -> "x_t-1"
             # 1. get previous step value (=t-1)
             # trick from heuri_sampling
             if cur_step == self.naive_sampling_step  and timestep == prev_timestep:
                 timestep += self.gap
             prev_timestep = prev_timestep  # NOTE naive sampling
             # 2. compute alphas, betas
             variance = self._get_variance(timestep, prev_timestep)
             std_dev_t = eta * variance ** (0.5)
             if use_clipped_model_output:
                 # the pred_epsilon is always re-derived from the clipped x_0 in Glide
                 pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
             pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
             # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
             prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
             if eta > 0:
                 prev_sample = prev_sample + variance
             if cur_step < self.naive_sampling_step:
+                prev_sample = self.add_noise(pred_original_sample, torch.randn_like(pred_original_sample), timestep)
             if not return_dict:
                 return (prev_sample,)
             return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)