BiliSakura committed on
Commit
f0eba3b
·
verified ·
1 Parent(s): 94572ed

Add files using upload-large-folder tool

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ demo_images/demo_sde250_class207_seed42.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,146 @@
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ library_name: diffusers
6
+ tags:
7
+ - diffusers
8
+ - image-generation
9
+ - class-conditional
10
+ - nit
11
+ pipeline_tag: unconditional-image-generation
12
+ widget:
13
+ - src: demo_images/demo_sde250_class207_seed42.png
14
+ example_title: NiT-XL Class 207
15
+ ---
16
+
17
+ # NiT-XL Diffusers (Class-Conditional)
18
+
19
+ Native-resolution Image Transformer (NiT-XL) checkpoint packaged as a Diffusers-style repository with vendored custom code.
20
+
21
+ ## What is included
22
+
23
+ - `transformer/`: `NiTTransformer2DModel` weights + config
24
+ - `scheduler/`: `NiTFlowMatchScheduler` config
25
+ - `vae/`: `AutoencoderDC` weights + config
26
+ - `custom_pipeline/`: local, self-contained implementation for:
27
+ - `NiTPipeline`
28
+ - `NiTTransformer2DModel`
29
+ - `NiTFlowMatchScheduler`
30
+ - `test_inference.py`: standalone sampling script
31
+
32
+ This repository does **not** depend on an external `NiT-diffusers` checkout during inference.
33
+ It includes a root `pipeline.py` custom entrypoint for Diffusers dynamic loading.
34
+
35
+ ## Quickstart
36
+
37
+ ### 1) Environment
38
+
39
+ Install dependencies (example):
40
+
41
+ ```bash
42
+ pip install torch diffusers safetensors
43
+ ```
44
+
45
+ If you are using this project's conda environment:
46
+
47
+ ```bash
48
+ conda activate rsgen
49
+ ```
50
+
51
+ ### 2) Generate a demo image
52
+
53
+ Run from this repository root:
54
+
55
+ ```bash
56
+ python test_inference.py \
57
+ --class-label 207 \
58
+ --height 512 \
59
+ --width 512 \
60
+ --steps 250 \
61
+ --mode sde \
62
+ --guidance-scale 2.05 \
63
+ --guidance-low 0.0 \
64
+ --guidance-high 0.7 \
65
+ --output demo_images/demo_sde250_class207_seed42.png
66
+ ```
67
+
68
+ ## Python usage
69
+
70
+ ```python
71
+ from pathlib import Path
72
+ import torch
73
+ from diffusers import DiffusionPipeline
74
+
75
+ model_dir = Path(".").resolve()
76
+ device = "cuda" if torch.cuda.is_available() else "cpu"
77
+ dtype = torch.bfloat16 if device == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
78
+
79
+ pipe = DiffusionPipeline.from_pretrained(
80
+ model_dir,
81
+ custom_pipeline=str(model_dir / "pipeline.py"),
82
+ local_files_only=True,
83
+ ).to(device)
84
+ if device == "cuda":
85
+ pipe.transformer.to(dtype=dtype)
86
+ pipe.vae.to(dtype=dtype)
87
+
88
+ gen = torch.Generator(device=device).manual_seed(42)
89
+ result = pipe(
90
+ class_labels=[207],
91
+ height=512,
92
+ width=512,
93
+ num_inference_steps=250,
94
+ mode="sde",
95
+ guidance_scale=2.05,
96
+ guidance_interval=(0.0, 0.7),
97
+ generator=gen,
98
+ )
99
+ result.images[0].save("demo_images/sample.png")
100
+ ```
101
+
102
+ For remote Hub loading:
103
+
104
+ ```python
105
+ from diffusers import DiffusionPipeline
106
+
107
+ pipe = DiffusionPipeline.from_pretrained(
108
+ "BiliSakura/NiT-XL-diffusers",
109
+ custom_pipeline="pipeline",
110
+ )
111
+ ```
112
+
113
+ ## Recommended inference settings
114
+
115
+ - Resolution: `512x512`
116
+ - Mode: `sde`
117
+ - Steps: `250`
118
+ - Guidance scale: `2.05`
119
+ - Guidance interval: `(0.0, 0.7)`
120
+
121
+ Running with very few steps (for example `2`) is useful only as a smoke test and will produce low-quality images.
122
+
123
+ ## Demo
124
+
125
+ ![NiT-XL demo image](demo_images/demo_sde250_class207_seed42.png)
126
+
127
+ ## Citation
128
+
129
+ If you use this model or the NiT method in your work, please cite:
130
+
131
+ ```bibtex
132
+ @article{wang2025native,
133
+ title={Native-Resolution Image Synthesis},
134
+ author={Wang, Zidong and Bai, Lei and Yue, Xiangyu and Ouyang, Wanli and Zhang, Yiyuan},
135
+ year={2025},
136
+ eprint={2506.03131},
137
+ archivePrefix={arXiv},
138
+ primaryClass={cs.CV}
139
+ }
140
+ ```
141
+
142
+ ## Notes
143
+
144
+ - This is a class-conditional generator (ImageNet label ids), not a text-to-image model.
145
+ - For reproducibility, set `--seed` on the CLI or pass a seeded `torch.Generator` in Python; see the sketch below.
146
+ - The vendored custom pipeline keeps inference behavior consistent without external code dependencies.
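
As referenced in the notes above, here is a minimal batched sketch. It assumes you run it from this repository root, as in the Python usage section; the class ids `207`, `250`, and `285` are arbitrary ImageNet labels chosen only for illustration, and dtype casting is omitted for brevity.

```python
import torch
from diffusers import DiffusionPipeline

# Load the vendored pipeline exactly as in the Python usage section above.
pipe = DiffusionPipeline.from_pretrained(
    ".",
    custom_pipeline="./pipeline.py",
    local_files_only=True,
).to("cuda" if torch.cuda.is_available() else "cpu")

# One seeded generator makes the whole batch reproducible.
generator = torch.Generator(device=pipe.device.type).manual_seed(42)

# `class_labels` accepts a list of ImageNet label ids; one image is produced per id.
result = pipe(
    class_labels=[207, 250, 285],
    height=512,
    width=512,
    num_inference_steps=250,
    mode="sde",
    guidance_scale=2.05,
    guidance_interval=(0.0, 0.7),
    generator=generator,
)
for label, image in zip([207, 250, 285], result.images):
    image.save(f"demo_images/class_{label}.png")
```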
__pycache__/pipeline.cpython-312.pyc ADDED
Binary file (728 Bytes).
 
custom_pipeline/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ from .pipeline_nit import NiTPipeline, NiTPipelineOutput
2
+ from .transformer_nit import NiTTransformer2DModel, NiTTransformer2DModelOutput
3
+ from .scheduling_flow_match_nit import NiTFlowMatchScheduler, NiTFlowMatchSchedulerOutput
4
+
5
+
6
+ def _register_with_diffusers():
7
+ """
8
+ Expose NiT classes on the `diffusers` namespace so pipeline/component loading
9
+ via `from_pretrained()` can resolve entries declared in model_index.json.
10
+ """
11
+ try:
12
+ import diffusers
13
+ except Exception:
14
+ return
15
+
16
+ setattr(diffusers, "NiTPipeline", NiTPipeline)
17
+ setattr(diffusers, "NiTTransformer2DModel", NiTTransformer2DModel)
18
+ setattr(diffusers, "NiTFlowMatchScheduler", NiTFlowMatchScheduler)
19
+
20
+
21
+ _register_with_diffusers()
22
+
23
+ __all__ = [
24
+ "NiTPipeline",
25
+ "NiTPipelineOutput",
26
+ "NiTTransformer2DModel",
27
+ "NiTTransformer2DModelOutput",
28
+ "NiTFlowMatchScheduler",
29
+ "NiTFlowMatchSchedulerOutput",
30
+ ]
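
The docstring above explains why the NiT classes are attached to the `diffusers` namespace: `model_index.json` lists each component as `["diffusers", "<ClassName>"]`, and Diffusers essentially resolves such entries by looking the class name up on the `diffusers` module. A minimal sketch of that resolution path (illustrative only; it assumes this repository root is on `sys.path` so `custom_pipeline` is importable):

```python
import sys

sys.path.insert(0, ".")  # hypothetical: make this repository root importable

import diffusers
import custom_pipeline  # importing the package runs _register_with_diffusers()

# Entries declared in model_index.json, e.g. ["diffusers", "NiTTransformer2DModel"],
# can now be found as attributes of the diffusers module:
transformer_cls = getattr(diffusers, "NiTTransformer2DModel")
scheduler_cls = getattr(diffusers, "NiTFlowMatchScheduler")
print(transformer_cls.__module__, scheduler_cls.__module__)
```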
custom_pipeline/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.11 kB).
 
custom_pipeline/__pycache__/pipeline_nit.cpython-312.pyc ADDED
Binary file (12.3 kB).
 
custom_pipeline/__pycache__/scheduling_flow_match_nit.cpython-312.pyc ADDED
Binary file (11.3 kB).
 
custom_pipeline/__pycache__/transformer_nit.cpython-312.pyc ADDED
Binary file (31.4 kB).
 
custom_pipeline/pipeline_nit.py ADDED
@@ -0,0 +1,237 @@
1
+ # Copyright 2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+
6
+ from dataclasses import dataclass
7
+ from typing import List, Optional, Tuple, Union
8
+
9
+ import torch
10
+
11
+ try:
12
+ from diffusers.image_processor import VaeImageProcessor
13
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
14
+ from diffusers.utils import BaseOutput
15
+ except Exception: # pragma: no cover - importable without a full diffusers install.
16
+ class BaseOutput(dict):
17
+ def __post_init__(self):
18
+ self.update(self.__dict__)
19
+
20
+ class DiffusionPipeline:
21
+ def register_modules(self, **kwargs):
22
+ for name, module in kwargs.items():
23
+ setattr(self, name, module)
24
+
25
+ @property
26
+ def _execution_device(self):
27
+ return torch.device("cpu")
28
+
29
+ def maybe_free_model_hooks(self):
30
+ pass
31
+
32
+ class VaeImageProcessor:
33
+ def postprocess(self, image, output_type="pil"):
34
+ return image
35
+
36
+
37
+ @dataclass
38
+ class NiTPipelineOutput(BaseOutput):
39
+ images: Union[torch.FloatTensor, List]
40
+
41
+
42
+ class NiTPipeline(DiffusionPipeline):
43
+ r"""
44
+ Native-resolution Image Synthesis pipeline using a class-conditional NiT transformer.
45
+
46
+ This pipeline follows Diffusers conventions: transformer, scheduler, and VAE are
47
+ saved as separate subfolders and restored with `DiffusionPipeline.from_pretrained`.
48
+ The transformer predicts flow-matching velocity in latent space.
49
+ """
50
+
51
+ model_cpu_offload_seq = "transformer->vae"
52
+ _optional_components = ["vae"]
53
+
54
+ def __init__(self, transformer, scheduler, vae=None):
55
+ super().__init__()
56
+ self.register_modules(transformer=transformer, scheduler=scheduler, vae=vae)
57
+ self.image_processor = VaeImageProcessor()
58
+
59
+ def _prepare_latents(
60
+ self,
61
+ batch_size: int,
62
+ height: int,
63
+ width: int,
64
+ dtype: torch.dtype,
65
+ device: torch.device,
66
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]],
67
+ ) -> Tuple[torch.Tensor, torch.LongTensor]:
68
+ if self.vae is None:
69
+ spatial_downsample = 1
70
+ elif self.vae.__class__.__name__ == "AutoencoderDC" or "dc-ae" in getattr(self.vae.config, "_name_or_path", ""):
71
+ spatial_downsample = 32
72
+ else:
73
+ spatial_downsample = getattr(self.vae.config, "block_out_channels", [0, 0, 0, 0])
74
+ spatial_downsample = 2 ** (len(spatial_downsample) - 1)
75
+
76
+ if height % spatial_downsample != 0 or width % spatial_downsample != 0:
77
+ raise ValueError(f"height and width must be divisible by the VAE downsample factor {spatial_downsample}.")
78
+
79
+ latent_height = height // spatial_downsample
80
+ latent_width = width // spatial_downsample
81
+ patch_size = int(self.transformer.config.patch_size)
82
+ if latent_height % patch_size != 0 or latent_width % patch_size != 0:
83
+ raise ValueError("Latent height and width must be divisible by transformer's patch_size.")
84
+
85
+ token_height = latent_height // patch_size
86
+ token_width = latent_width // patch_size
87
+ image_sizes = torch.tensor([[token_height, token_width]] * batch_size, device=device, dtype=torch.long)
88
+
89
+ # Match native NiT sampler initialization exactly: sample directly in packed-token space.
90
+ packed_shape = (
91
+ batch_size * token_height * token_width,
92
+ self.transformer.config.in_channels,
93
+ patch_size,
94
+ patch_size,
95
+ )
96
+ packed_latents = torch.randn(packed_shape, generator=generator, device=device, dtype=dtype)
97
+ return packed_latents, image_sizes
98
+
99
+ def _apply_classifier_free_guidance(
100
+ self,
101
+ model_output: torch.Tensor,
102
+ guidance_scale: float,
103
+ guidance_active: bool,
104
+ ) -> torch.Tensor:
105
+ if guidance_scale <= 1.0 or not guidance_active:
106
+ return model_output
107
+ model_output_cond, model_output_uncond = model_output.chunk(2)
108
+ return model_output_uncond + guidance_scale * (model_output_cond - model_output_uncond)
109
+
110
+ def _get_vae_dtype(self, latents: torch.Tensor) -> torch.dtype:
111
+ vae_dtype = getattr(self.vae, "dtype", None)
112
+ if vae_dtype is not None:
113
+ return vae_dtype
114
+ vae_params = next(self.vae.parameters(), None)
115
+ return vae_params.dtype if vae_params is not None else latents.dtype
116
+
117
+ def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
118
+ if self.vae is None:
119
+ return latents
120
+ vae_dtype = self._get_vae_dtype(latents)
121
+ latents = latents.to(dtype=vae_dtype)
122
+ scaling_factor = getattr(self.vae.config, "scaling_factor", 1.0)
123
+ latents = latents / scaling_factor
124
+ if self.vae.__class__.__name__ == "AutoencoderDC":
125
+ image = self.vae._decode(latents)
126
+ else:
127
+ image = self.vae.decode(latents)
128
+ image = image.sample if hasattr(image, "sample") else image
129
+ return image
130
+
131
+ @torch.no_grad()
132
+ def __call__(
133
+ self,
134
+ class_labels: Union[int, List[int], torch.LongTensor],
135
+ height: int = 256,
136
+ width: int = 256,
137
+ num_inference_steps: int = 50,
138
+ guidance_scale: float = 1.0,
139
+ guidance_interval: Tuple[float, float] = (0.0, 1.0),
140
+ mode: str = "ode",
141
+ heun: bool = False,
142
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
143
+ output_type: str = "pil",
144
+ return_dict: bool = True,
145
+ ) -> Union[NiTPipelineOutput, Tuple]:
146
+ device = self._execution_device
147
+ model_dtype = next(self.transformer.parameters()).dtype
148
+
149
+ if isinstance(class_labels, int):
150
+ class_labels = [class_labels]
151
+ if not torch.is_tensor(class_labels):
152
+ class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
153
+ else:
154
+ class_labels = class_labels.to(device=device, dtype=torch.long)
155
+ batch_size = class_labels.numel()
156
+
157
+ packed_latents, image_sizes = self._prepare_latents(batch_size, height, width, model_dtype, device, generator)
158
+ packed_latents = packed_latents.to(dtype=torch.float64)
159
+ timesteps = self.scheduler.set_timesteps(num_inference_steps, device=device, mode=mode)
160
+
161
+ null_labels = torch.full_like(class_labels, self.transformer.config.num_classes)
162
+ for index, timestep in enumerate(timesteps[:-1]):
163
+ next_timestep = timesteps[index + 1]
164
+ guidance_active = guidance_interval[0] <= float(timestep) <= guidance_interval[1]
165
+ if guidance_scale > 1.0 and guidance_active:
166
+ model_input = torch.cat([packed_latents, packed_latents], dim=0)
167
+ labels = torch.cat([class_labels, null_labels], dim=0)
168
+ model_image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
169
+ else:
170
+ model_input = packed_latents
171
+ labels = class_labels
172
+ model_image_sizes = image_sizes
173
+
174
+ timestep_batch = torch.full((labels.numel(),), float(timestep), device=device, dtype=model_dtype)
175
+ model_output = self.transformer(
176
+ model_input.to(dtype=model_dtype),
177
+ timestep_batch,
178
+ labels,
179
+ image_sizes=model_image_sizes,
180
+ return_dict=True,
181
+ ).sample
182
+ model_output = self._apply_classifier_free_guidance(model_output, guidance_scale, guidance_active)
183
+
184
+ if heun and mode == "ode" and index < len(timesteps) - 2:
185
+ provisional = self.scheduler.step(
186
+ model_output,
187
+ timestep[None],
188
+ packed_latents,
189
+ next_timestep[None],
190
+ image_sizes=image_sizes,
191
+ ).prev_sample
192
+ if guidance_scale > 1.0 and guidance_active:
193
+ prime_input = torch.cat([provisional, provisional], dim=0)
194
+ labels = torch.cat([class_labels, null_labels], dim=0)
195
+ model_image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
196
+ else:
197
+ prime_input = provisional
198
+ labels = class_labels
199
+ model_image_sizes = image_sizes
200
+ next_timestep_batch = torch.full((labels.numel(),), float(next_timestep), device=device, dtype=model_dtype)
201
+ next_model_output = self.transformer(
202
+ prime_input.to(dtype=model_dtype),
203
+ next_timestep_batch,
204
+ labels,
205
+ image_sizes=model_image_sizes,
206
+ return_dict=True,
207
+ ).sample
208
+ next_model_output = self._apply_classifier_free_guidance(
209
+ next_model_output, guidance_scale, guidance_active
210
+ )
211
+ packed_latents = self.scheduler.step_heun(
212
+ model_output, next_model_output, timestep[None], packed_latents, next_timestep[None]
213
+ ).prev_sample
214
+ else:
215
+ packed_latents = self.scheduler.step(
216
+ model_output,
217
+ timestep[None],
218
+ packed_latents,
219
+ next_timestep[None],
220
+ image_sizes=image_sizes,
221
+ generator=generator,
222
+ ).prev_sample
223
+
224
+ latents = self.transformer._unpack_latents(packed_latents, image_sizes)
225
+ image = self._decode_latents(latents)
226
+ if self.vae is not None:
227
+ image = (image / 2 + 0.5).clamp(0, 1)
228
+ image = self.image_processor.postprocess(
229
+ image,
230
+ output_type=output_type,
231
+ do_denormalize=[False] * image.shape[0],
232
+ )
233
+
234
+ self.maybe_free_model_hooks()
235
+ if not return_dict:
236
+ return (image,)
237
+ return NiTPipelineOutput(images=image)
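
As a summary of `_apply_classifier_free_guidance` and the guidance-interval check in `__call__` above: with guidance scale $s$ and interval $[t_{\text{low}}, t_{\text{high}}]$, the conditional and unconditional branches are batched together and combined only while the current timestep lies inside the interval (and $s > 1$); otherwise the conditional prediction is used unchanged.

```latex
v_{\text{guided}}(x_t, t, y) =
\begin{cases}
v_\theta(x_t, t, \varnothing) + s\,\bigl(v_\theta(x_t, t, y) - v_\theta(x_t, t, \varnothing)\bigr),
  & s > 1 \ \text{and}\ t_{\text{low}} \le t \le t_{\text{high}}, \\[4pt]
v_\theta(x_t, t, y), & \text{otherwise,}
\end{cases}
```

where $\varnothing$ denotes the null class (label id `num_classes`, as built from `null_labels` above).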
custom_pipeline/scheduling_flow_match_nit.py ADDED
@@ -0,0 +1,187 @@
1
+ # Copyright 2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Optional, Tuple
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ try:
13
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
14
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
15
+ from diffusers.utils import BaseOutput
16
+ except Exception: # pragma: no cover - importable without an installed diffusers checkout.
17
+ class BaseOutput(dict):
18
+ def __post_init__(self):
19
+ self.update(self.__dict__)
20
+
21
+ class ConfigMixin:
22
+ config_name = "scheduler_config.json"
23
+
24
+ class SchedulerMixin:
25
+ pass
26
+
27
+ def register_to_config(init):
28
+ return init
29
+
30
+
31
+ @dataclass
32
+ class NiTFlowMatchSchedulerOutput(BaseOutput):
33
+ prev_sample: torch.FloatTensor
34
+
35
+
36
+ class NiTFlowMatchScheduler(SchedulerMixin, ConfigMixin):
37
+ """
38
+ Flow-matching ODE/SDE scheduler used by Native-resolution Image Synthesis (NiT).
39
+
40
+ The model predicts velocity with a linear path by default. Timesteps run from 1 to 0,
41
+ matching the original sampler while exposing the standard Diffusers `set_timesteps`
42
+ and `step` API.
43
+ """
44
+
45
+ config_name = "scheduler_config.json"
46
+ order = 1
47
+
48
+ @register_to_config
49
+ def __init__(
50
+ self,
51
+ mode: str = "ode",
52
+ path_type: str = "linear",
53
+ num_train_timesteps: int = 1000,
54
+ ):
55
+ if mode not in {"ode", "sde"}:
56
+ raise ValueError("mode must be either 'ode' or 'sde'.")
57
+ if path_type not in {"linear", "cosine"}:
58
+ raise ValueError("path_type must be either 'linear' or 'cosine'.")
59
+ self.mode = mode
60
+ self.path_type = path_type
61
+ self.num_train_timesteps = num_train_timesteps
62
+ # Native NiT integrates in float64 for better numerical stability.
63
+ self.timesteps = torch.from_numpy(np.linspace(1.0, 0.0, num_train_timesteps + 1)).to(dtype=torch.float64)
64
+
65
+ def set_timesteps(
66
+ self,
67
+ num_inference_steps: int,
68
+ device: Optional[torch.device] = None,
69
+ mode: Optional[str] = None,
70
+ ):
71
+ mode = mode or self.mode
72
+ dtype = self.timesteps.dtype
73
+ if mode == "sde":
74
+ timesteps = torch.linspace(1.0, 0.04, num_inference_steps, dtype=dtype)
75
+ timesteps = torch.cat([timesteps, torch.zeros(1, dtype=dtype)])
76
+ elif mode == "ode":
77
+ timesteps = torch.linspace(1.0, 0.0, num_inference_steps + 1, dtype=dtype)
78
+ else:
79
+ raise ValueError("mode must be either 'ode' or 'sde'.")
80
+ self.mode = mode
81
+ self.timesteps = timesteps.to(device=device)
82
+ return self.timesteps
83
+
84
+ @staticmethod
85
+ def _expand_t_like_sample(timestep: torch.Tensor, sample: torch.Tensor, image_sizes: torch.LongTensor):
86
+ dims = [1] * (sample.ndim - 1)
87
+ seqlens = image_sizes[:, 0] * image_sizes[:, 1]
88
+ if timestep.numel() == 1:
89
+ timestep = timestep.repeat(image_sizes.shape[0])
90
+ return torch.cat(
91
+ [timestep[i].reshape(1, *dims).repeat(int(seqlens[i]), *dims) for i in range(image_sizes.shape[0])]
92
+ )
93
+
94
+ def _get_score_from_velocity(
95
+ self,
96
+ model_output: torch.Tensor,
97
+ sample: torch.Tensor,
98
+ timestep: torch.Tensor,
99
+ image_sizes: torch.LongTensor,
100
+ ):
101
+ timestep = self._expand_t_like_sample(timestep, sample, image_sizes)
102
+ if self.path_type == "linear":
103
+ alpha_t, d_alpha_t = 1 - timestep, torch.ones_like(timestep) * -1
104
+ sigma_t, d_sigma_t = timestep, torch.ones_like(timestep)
105
+ elif self.path_type == "cosine":
106
+ alpha_t = torch.cos(timestep * np.pi / 2)
107
+ sigma_t = torch.sin(timestep * np.pi / 2)
108
+ d_alpha_t = -np.pi / 2 * torch.sin(timestep * np.pi / 2)
109
+ d_sigma_t = np.pi / 2 * torch.cos(timestep * np.pi / 2)
110
+ else:
111
+ raise ValueError(f"Unsupported path_type: {self.path_type}")
112
+ reverse_alpha_ratio = alpha_t / d_alpha_t
113
+ variance = sigma_t**2 - reverse_alpha_ratio * d_sigma_t * sigma_t
114
+ return (reverse_alpha_ratio * model_output - sample) / variance
115
+
116
+ @staticmethod
117
+ def _compute_diffusion(timestep: torch.Tensor):
118
+ return 2 * timestep
119
+
120
+ @staticmethod
121
+ def _promote_dtypes(*tensors: torch.Tensor) -> torch.dtype:
122
+ dtype = None
123
+ for tensor in tensors:
124
+ if tensor.is_floating_point() or tensor.is_complex():
125
+ dtype = tensor.dtype if dtype is None else torch.promote_types(dtype, tensor.dtype)
126
+ return dtype if dtype is not None else torch.get_default_dtype()
127
+
128
+ def step(
129
+ self,
130
+ model_output: torch.Tensor,
131
+ timestep: torch.Tensor,
132
+ sample: torch.Tensor,
133
+ next_timestep: torch.Tensor,
134
+ image_sizes: Optional[torch.LongTensor] = None,
135
+ generator: Optional[torch.Generator] = None,
136
+ return_dict: bool = True,
137
+ ) -> NiTFlowMatchSchedulerOutput:
138
+ compute_dtype = torch.float64
139
+ sample = sample.to(dtype=compute_dtype)
140
+ model_output = model_output.to(dtype=compute_dtype)
141
+ timestep = timestep.to(device=sample.device, dtype=compute_dtype).flatten()
142
+ next_timestep = next_timestep.to(device=sample.device, dtype=compute_dtype).flatten()
143
+
144
+ if self.mode == "ode":
145
+ prev_sample = sample + (next_timestep[0] - timestep[0]) * model_output
146
+ else:
147
+ if image_sizes is None:
148
+ raise ValueError("image_sizes are required for SDE sampling.")
149
+ image_sizes = image_sizes.to(device=sample.device, dtype=torch.long)
150
+ diffusion = self._compute_diffusion(timestep[0])
151
+ score = self._get_score_from_velocity(model_output, sample, timestep, image_sizes)
152
+ drift = model_output - 0.5 * diffusion * score
153
+ dt = next_timestep[0] - timestep[0]
154
+ if torch.allclose(next_timestep[0], torch.zeros_like(next_timestep[0])):
155
+ prev_sample = sample + drift * dt
156
+ else:
157
+ if generator is not None:
158
+ noise = torch.randn(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype)
159
+ else:
160
+ noise = torch.randn_like(sample)
161
+ prev_sample = sample + drift * dt + torch.sqrt(diffusion) * noise * torch.sqrt(torch.abs(dt))
162
+
163
+ if not return_dict:
164
+ return (prev_sample,)
165
+ return NiTFlowMatchSchedulerOutput(prev_sample=prev_sample)
166
+
167
+ def step_heun(
168
+ self,
169
+ model_output: torch.Tensor,
170
+ next_model_output: torch.Tensor,
171
+ timestep: torch.Tensor,
172
+ sample: torch.Tensor,
173
+ next_timestep: torch.Tensor,
174
+ return_dict: bool = True,
175
+ ) -> NiTFlowMatchSchedulerOutput:
176
+ if self.mode != "ode":
177
+ raise ValueError("Heun correction is only defined for ODE sampling.")
178
+ compute_dtype = torch.float64
179
+ sample = sample.to(dtype=compute_dtype)
180
+ model_output = model_output.to(dtype=compute_dtype)
181
+ next_model_output = next_model_output.to(dtype=compute_dtype)
182
+ timestep = timestep.to(device=sample.device, dtype=compute_dtype).flatten()
183
+ next_timestep = next_timestep.to(device=sample.device, dtype=compute_dtype).flatten()
184
+ prev_sample = sample + (next_timestep[0] - timestep[0]) * (0.5 * model_output + 0.5 * next_model_output)
185
+ if not return_dict:
186
+ return (prev_sample,)
187
+ return NiTFlowMatchSchedulerOutput(prev_sample=prev_sample)
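
For reference, these are the update rules implemented in `step` above, written for a step from $t$ to $t'$ with $\Delta t = t' - t < 0$, velocity prediction $v_\theta$, and the default linear path $\alpha_t = 1 - t$, $\sigma_t = t$:

```latex
\text{ODE (Euler):}\quad x_{t'} = x_t + \Delta t\, v_\theta(x_t, t)

\text{SDE:}\quad
s_\theta(x_t, t) = \frac{\tfrac{\alpha_t}{\dot\alpha_t}\, v_\theta(x_t, t) - x_t}
                        {\sigma_t^2 - \tfrac{\alpha_t}{\dot\alpha_t}\,\dot\sigma_t\,\sigma_t},
\qquad g(t) = 2t,

x_{t'} = x_t + \Bigl(v_\theta(x_t, t) - \tfrac{1}{2}\, g(t)\, s_\theta(x_t, t)\Bigr)\,\Delta t
        + \sqrt{g(t)}\,\sqrt{|\Delta t|}\;\xi, \qquad \xi \sim \mathcal{N}(0, I),
```

with the noise term dropped on the final step to $t' = 0$, and `step_heun` averaging the two velocity evaluations for the ODE case: $x_{t'} = x_t + \tfrac{\Delta t}{2}\bigl(v_\theta(x_t, t) + v_\theta(\tilde{x}_{t'}, t')\bigr)$.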
custom_pipeline/transformer_nit.py ADDED
@@ -0,0 +1,471 @@
1
+ # Copyright 2026 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+
6
+ from dataclasses import dataclass
7
+ import math
8
+ from typing import List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ try:
15
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
16
+ from diffusers.models.modeling_utils import ModelMixin
17
+ from diffusers.utils import BaseOutput
18
+ except Exception: # pragma: no cover - lets this subtree be tested outside diffusers.
19
+ class BaseOutput(dict):
20
+ def __post_init__(self):
21
+ self.update(self.__dict__)
22
+
23
+ class _Config(dict):
24
+ def __getattr__(self, key):
25
+ try:
26
+ return self[key]
27
+ except KeyError as error:
28
+ raise AttributeError(key) from error
29
+
30
+ class ConfigMixin:
31
+ config_name = "config.json"
32
+
33
+ class ModelMixin(nn.Module):
34
+ pass
35
+
36
+ def register_to_config(init):
37
+ def wrapper(self, *args, **kwargs):
38
+ import inspect
39
+
40
+ signature = inspect.signature(init)
41
+ bound = signature.bind(self, *args, **kwargs)
42
+ bound.apply_defaults()
43
+ self.config = _Config({key: value for key, value in bound.arguments.items() if key != "self"})
44
+ init(self, *args, **kwargs)
45
+
46
+ return wrapper
47
+
48
+
49
+ try:
50
+ from flash_attn import flash_attn_varlen_func
51
+ except Exception: # pragma: no cover - optional acceleration.
52
+ flash_attn_varlen_func = None
53
+
54
+
55
+ @dataclass
56
+ class NiTTransformer2DModelOutput(BaseOutput):
57
+ sample: torch.FloatTensor
58
+ projection_states: Optional[Tuple[torch.FloatTensor, ...]] = None
59
+
60
+
61
+ def _modulate(hidden_states: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
62
+ return hidden_states * (1 + scale) + shift
63
+
64
+
65
+ def _rotate_half(hidden_states: torch.Tensor) -> torch.Tensor:
66
+ hidden_states = hidden_states.reshape(*hidden_states.shape[:-1], -1, 2)
67
+ hidden_states_1, hidden_states_2 = hidden_states.unbind(dim=-1)
68
+ return torch.stack((-hidden_states_2, hidden_states_1), dim=-1).flatten(-2)
69
+
70
+
71
+ def _get_float_dtype_or_default(tensor: Optional[torch.Tensor] = None) -> torch.dtype:
72
+ if tensor is not None and tensor.is_floating_point():
73
+ return tensor.dtype
74
+ return torch.get_default_dtype()
75
+
76
+
77
+ class NiTPatchEmbed(nn.Module):
78
+ def __init__(self, patch_size: int, in_channels: int, hidden_size: int):
79
+ super().__init__()
80
+ self.patch_size = (patch_size, patch_size)
81
+ self.proj = nn.Conv2d(in_channels, hidden_size, kernel_size=patch_size, stride=patch_size, bias=True)
82
+
83
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
84
+ hidden_states = self.proj(hidden_states)
85
+ return hidden_states.flatten(2).transpose(1, 2)
86
+
87
+
88
+ class NiTTimestepEmbedder(nn.Module):
89
+ def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
90
+ super().__init__()
91
+ self.frequency_embedding_size = frequency_embedding_size
92
+ self.mlp = nn.Sequential(
93
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
94
+ nn.SiLU(),
95
+ nn.Linear(hidden_size, hidden_size, bias=True),
96
+ )
97
+
98
+ @staticmethod
99
+ def get_timestep_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: int = 10000):
100
+ half = embedding_dim // 2
101
+ # Keep sinusoid construction in fp32 to mirror the native NiT implementation.
102
+ exponent = -math.log(max_period) * torch.arange(half, dtype=torch.float32, device=timesteps.device) / half
103
+ freqs = torch.exp(exponent)
104
+ args = timesteps.float()[:, None] * freqs[None]
105
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
106
+ if embedding_dim % 2:
107
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
108
+ return embedding
109
+
110
+ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
111
+ timestep_freq = self.get_timestep_embedding(timesteps, self.frequency_embedding_size).to(timesteps.dtype)
112
+ return self.mlp(timestep_freq)
113
+
114
+
115
+ class NiTLabelEmbedder(nn.Module):
116
+ def __init__(self, num_classes: int, hidden_size: int, dropout_prob: float):
117
+ super().__init__()
118
+ use_cfg_embedding = dropout_prob > 0
119
+ self.embedding_table = nn.Embedding(num_classes + int(use_cfg_embedding), hidden_size)
120
+ self.num_classes = num_classes
121
+ self.dropout_prob = dropout_prob
122
+
123
+ def forward(self, class_labels: torch.LongTensor) -> torch.Tensor:
124
+ return self.embedding_table(class_labels)
125
+
126
+
127
+ class NiTRotaryEmbedding(nn.Module):
128
+ def __init__(
129
+ self,
130
+ head_dim: int,
131
+ custom_freqs: str = "normal",
132
+ theta: int = 10000,
133
+ max_cached_len: int = 1024,
134
+ max_pe_len_h: Optional[int] = None,
135
+ max_pe_len_w: Optional[int] = None,
136
+ decouple: bool = False,
137
+ ori_max_pe_len: Optional[int] = None,
138
+ ):
139
+ super().__init__()
140
+ del max_pe_len_h, max_pe_len_w, decouple, ori_max_pe_len
141
+ if custom_freqs not in {"normal", "scale1", "scale2"}:
142
+ raise ValueError(
143
+ "This Diffusers implementation supports the trained RoPE frequencies directly. "
144
+ "Checkpoint conversion preserves weights; extrapolation variants should be handled "
145
+ "by changing the model config before loading."
146
+ )
147
+ dim = head_dim // 2
148
+ if dim % 2 != 0:
149
+ raise ValueError("NiT rotary embedding requires head_dim // 2 to be even.")
150
+ default_dtype = _get_float_dtype_or_default()
151
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=default_dtype) / dim))
152
+ self.register_buffer("freqs_h", freqs, persistent=False)
153
+ self.register_buffer("freqs_w", freqs.clone(), persistent=False)
154
+ positions = torch.arange(max_cached_len, dtype=default_dtype)
155
+ freqs_h_cached = torch.einsum("n,f->nf", positions, self.freqs_h).repeat_interleave(2, dim=-1)
156
+ freqs_w_cached = torch.einsum("n,f->nf", positions, self.freqs_w).repeat_interleave(2, dim=-1)
157
+ self.register_buffer("freqs_h_cached", freqs_h_cached, persistent=False)
158
+ self.register_buffer("freqs_w_cached", freqs_w_cached, persistent=False)
159
+
160
+ def forward(self, image_sizes: torch.LongTensor, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
161
+ grids = []
162
+ for height, width in image_sizes.tolist():
163
+ # Use the same meshgrid ordering as native NiT to preserve RoPE-token alignment.
164
+ grid_h = torch.arange(height, device=device)
165
+ grid_w = torch.arange(width, device=device)
166
+ grid = torch.meshgrid(grid_h, grid_w, indexing="xy")
167
+ grids.append(torch.stack(grid, dim=0).reshape(2, -1))
168
+ grid = torch.cat(grids, dim=1)
169
+ freqs_h = self.freqs_h_cached.to(device)[grid[0]]
170
+ freqs_w = self.freqs_w_cached.to(device)[grid[1]]
171
+ freqs = torch.cat([freqs_h, freqs_w], dim=-1)
172
+ return freqs.cos().unsqueeze(1), freqs.sin().unsqueeze(1)
173
+
174
+
175
+ class NiTAttention(nn.Module):
176
+ def __init__(self, hidden_size: int, num_heads: int, qk_norm: bool = False):
177
+ super().__init__()
178
+ if hidden_size % num_heads != 0:
179
+ raise ValueError("hidden_size must be divisible by num_heads")
180
+ self.num_heads = num_heads
181
+ self.head_dim = hidden_size // num_heads
182
+ self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=True)
183
+ self.q_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()
184
+ self.k_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()
185
+ self.proj = nn.Linear(hidden_size, hidden_size)
186
+ self.proj_drop = nn.Dropout(0.0)
187
+
188
+ def forward(
189
+ self,
190
+ hidden_states: torch.Tensor,
191
+ cu_seqlens: torch.IntTensor,
192
+ freqs_cos: torch.Tensor,
193
+ freqs_sin: torch.Tensor,
194
+ ) -> torch.Tensor:
195
+ qkv = self.qkv(hidden_states).reshape(hidden_states.shape[0], 3, self.num_heads, self.head_dim)
196
+ query, key, value = qkv.unbind(dim=1)
197
+ original_dtype = qkv.dtype
198
+ query = self.q_norm(query)
199
+ key = self.k_norm(key)
200
+ query = query * freqs_cos + _rotate_half(query) * freqs_sin
201
+ key = key * freqs_cos + _rotate_half(key) * freqs_sin
202
+ query = query.to(dtype=original_dtype)
203
+ key = key.to(dtype=original_dtype)
204
+
205
+ if flash_attn_varlen_func is not None and query.is_cuda:
206
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
207
+ hidden_states = flash_attn_varlen_func(
208
+ query, key, value, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen
209
+ ).reshape(hidden_states.shape[0], -1)
210
+ else:
211
+ segments = []
212
+ for start, end in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
213
+ q = query[start:end].transpose(0, 1).unsqueeze(0)
214
+ k = key[start:end].transpose(0, 1).unsqueeze(0)
215
+ v = value[start:end].transpose(0, 1).unsqueeze(0)
216
+ segments.append(F.scaled_dot_product_attention(q, k, v).squeeze(0).transpose(0, 1))
217
+ hidden_states = torch.cat(segments, dim=0).reshape(hidden_states.shape[0], -1)
218
+
219
+ hidden_states = self.proj(hidden_states)
220
+ return self.proj_drop(hidden_states)
221
+
222
+
223
+ class NiTMLP(nn.Module):
224
+ def __init__(self, hidden_size: int, mlp_hidden_dim: int):
225
+ super().__init__()
226
+ self.fc1 = nn.Linear(hidden_size, mlp_hidden_dim)
227
+ self.act = nn.GELU(approximate="tanh")
228
+ self.drop1 = nn.Dropout(0.0)
229
+ self.norm = nn.Identity()
230
+ self.fc2 = nn.Linear(mlp_hidden_dim, hidden_size)
231
+ self.drop2 = nn.Dropout(0.0)
232
+
233
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
234
+ hidden_states = self.fc1(hidden_states)
235
+ hidden_states = self.act(hidden_states)
236
+ hidden_states = self.drop1(hidden_states)
237
+ hidden_states = self.norm(hidden_states)
238
+ hidden_states = self.fc2(hidden_states)
239
+ return self.drop2(hidden_states)
240
+
241
+
242
+ class NiTBlock(nn.Module):
243
+ def __init__(
244
+ self,
245
+ hidden_size: int,
246
+ num_heads: int,
247
+ mlp_ratio: float = 4.0,
248
+ qk_norm: bool = False,
249
+ use_adaln_lora: bool = False,
250
+ adaln_lora_dim: int = 512,
251
+ ):
252
+ super().__init__()
253
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
254
+ self.attn = NiTAttention(hidden_size, num_heads=num_heads, qk_norm=qk_norm)
255
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
256
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
257
+ self.mlp = NiTMLP(hidden_size, mlp_hidden_dim)
258
+ if use_adaln_lora:
259
+ self.adaLN_modulation = nn.Sequential(
260
+ nn.SiLU(),
261
+ nn.Linear(hidden_size, adaln_lora_dim, bias=True),
262
+ nn.Linear(adaln_lora_dim, 6 * hidden_size, bias=True),
263
+ )
264
+ else:
265
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
266
+
267
+ def forward(self, hidden_states, conditioning, cu_seqlens, freqs_cos, freqs_sin):
268
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(conditioning).chunk(
269
+ 6, dim=-1
270
+ )
271
+ hidden_states = hidden_states + gate_msa * self.attn(
272
+ _modulate(self.norm1(hidden_states), shift_msa, scale_msa), cu_seqlens, freqs_cos, freqs_sin
273
+ )
274
+ hidden_states = hidden_states + gate_mlp * self.mlp(
275
+ _modulate(self.norm2(hidden_states), shift_mlp, scale_mlp)
276
+ )
277
+ return hidden_states
278
+
279
+
280
+ class NiTFinalLayer(nn.Module):
281
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
282
+ super().__init__()
283
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
284
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
285
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
286
+
287
+ def forward(self, hidden_states: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
288
+ shift, scale = self.adaLN_modulation(conditioning).chunk(2, dim=-1)
289
+ hidden_states = _modulate(self.norm_final(hidden_states), shift, scale)
290
+ return self.linear(hidden_states)
291
+
292
+
293
+ def _build_mlp(hidden_size: int, projector_dim: int, z_dim: int) -> nn.Sequential:
294
+ return nn.Sequential(
295
+ nn.Linear(hidden_size, projector_dim),
296
+ nn.SiLU(),
297
+ nn.Linear(projector_dim, projector_dim),
298
+ nn.SiLU(),
299
+ nn.Linear(projector_dim, z_dim),
300
+ )
301
+
302
+
303
+ class NiTTransformer2DModel(ModelMixin, ConfigMixin):
304
+ config_name = "config.json"
305
+
306
+ @register_to_config
307
+ def __init__(
308
+ self,
309
+ input_size: int = 32,
310
+ patch_size: int = 1,
311
+ in_channels: int = 32,
312
+ hidden_size: int = 1152,
313
+ depth: int = 28,
314
+ num_heads: int = 16,
315
+ mlp_ratio: float = 4.0,
316
+ class_dropout_prob: float = 0.1,
317
+ num_classes: int = 1000,
318
+ encoder_depth: int = 8,
319
+ projector_dim: int = 2048,
320
+ z_dim: int = 1280,
321
+ use_checkpoint: bool = False,
322
+ custom_freqs: str = "normal",
323
+ theta: int = 10000,
324
+ max_pe_len_h: Optional[int] = None,
325
+ max_pe_len_w: Optional[int] = None,
326
+ decouple: bool = False,
327
+ ori_max_pe_len: Optional[int] = None,
328
+ qk_norm: bool = True,
329
+ use_adaln_lora: bool = False,
330
+ adaln_lora_dim: int = 512,
331
+ ):
332
+ super().__init__()
333
+ del input_size
334
+ self.in_channels = in_channels
335
+ self.out_channels = in_channels
336
+ self.patch_size = patch_size
337
+ self.num_heads = num_heads
338
+ self.num_classes = num_classes
339
+ self.encoder_depth = encoder_depth
340
+ self.use_checkpoint = use_checkpoint
341
+
342
+ self.x_embedder = NiTPatchEmbed(patch_size, in_channels, hidden_size)
343
+ self.t_embedder = NiTTimestepEmbedder(hidden_size)
344
+ self.y_embedder = NiTLabelEmbedder(num_classes, hidden_size, class_dropout_prob)
345
+ self.rope = NiTRotaryEmbedding(
346
+ hidden_size // num_heads,
347
+ custom_freqs=custom_freqs,
348
+ theta=theta,
349
+ max_pe_len_h=max_pe_len_h,
350
+ max_pe_len_w=max_pe_len_w,
351
+ decouple=decouple,
352
+ ori_max_pe_len=ori_max_pe_len,
353
+ )
354
+ self.projector = _build_mlp(hidden_size, projector_dim, z_dim)
355
+ self.blocks = nn.ModuleList(
356
+ [
357
+ NiTBlock(
358
+ hidden_size,
359
+ num_heads,
360
+ mlp_ratio=mlp_ratio,
361
+ qk_norm=qk_norm,
362
+ use_adaln_lora=use_adaln_lora,
363
+ adaln_lora_dim=adaln_lora_dim,
364
+ )
365
+ for _ in range(depth)
366
+ ]
367
+ )
368
+ self.final_layer = NiTFinalLayer(hidden_size, patch_size, self.out_channels)
369
+
370
+ def _pack_latents(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.LongTensor, Tuple[int, int]]:
371
+ batch_size, channels, height, width = hidden_states.shape
372
+ if channels != self.in_channels:
373
+ raise ValueError(f"Expected {self.in_channels} latent channels, got {channels}.")
374
+ if height % self.patch_size != 0 or width % self.patch_size != 0:
375
+ raise ValueError("Latent height and width must be divisible by patch_size.")
376
+ latent_h = height // self.patch_size
377
+ latent_w = width // self.patch_size
378
+ hidden_states = hidden_states.reshape(batch_size, channels, latent_h, self.patch_size, latent_w, self.patch_size)
379
+ hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5).reshape(
380
+ batch_size * latent_h * latent_w, channels, self.patch_size, self.patch_size
381
+ )
382
+ image_sizes = torch.tensor([[latent_h, latent_w]] * batch_size, device=hidden_states.device, dtype=torch.long)
383
+ return hidden_states, image_sizes, (height, width)
384
+
385
+ def _unpack_latents(self, hidden_states: torch.Tensor, image_sizes: torch.LongTensor) -> torch.Tensor:
386
+ if image_sizes.shape[0] == 1:
387
+ height, width = image_sizes[0].tolist()
388
+ hidden_states = hidden_states.reshape(height, width, self.out_channels, self.patch_size, self.patch_size)
389
+ return hidden_states.permute(2, 0, 3, 1, 4).reshape(
390
+ 1, self.out_channels, height * self.patch_size, width * self.patch_size
391
+ )
392
+
393
+ samples = []
394
+ cursor = 0
395
+ for height, width in image_sizes.tolist():
396
+ length = height * width
397
+ sample = hidden_states[cursor : cursor + length].reshape(
398
+ height, width, self.out_channels, self.patch_size, self.patch_size
399
+ )
400
+ samples.append(
401
+ sample.permute(2, 0, 3, 1, 4).reshape(
402
+ self.out_channels, height * self.patch_size, width * self.patch_size
403
+ )
404
+ )
405
+ cursor += length
406
+ if len({tuple(sample.shape) for sample in samples}) != 1:
407
+ return hidden_states
408
+ return torch.stack(samples, dim=0)
409
+
410
+ def forward(
411
+ self,
412
+ hidden_states: torch.Tensor,
413
+ timestep: Union[torch.Tensor, float],
414
+ class_labels: torch.LongTensor,
415
+ image_sizes: Optional[Union[torch.LongTensor, List[Tuple[int, int]]]] = None,
416
+ return_dict: bool = True,
417
+ output_projection_states: bool = False,
418
+ ) -> Union[NiTTransformer2DModelOutput, Tuple[torch.Tensor, ...]]:
419
+ input_was_image = hidden_states.dim() == 4 and image_sizes is None
420
+ if input_was_image:
421
+ hidden_states, image_sizes, _ = self._pack_latents(hidden_states)
422
+ elif image_sizes is None:
423
+ raise ValueError("image_sizes must be provided when hidden_states are already packed.")
424
+ elif not torch.is_tensor(image_sizes):
425
+ image_sizes = torch.tensor(image_sizes, device=hidden_states.device, dtype=torch.long)
426
+ else:
427
+ image_sizes = image_sizes.to(device=hidden_states.device, dtype=torch.long)
428
+
429
+ if not torch.is_tensor(timestep):
430
+ timestep = torch.tensor([timestep], device=hidden_states.device, dtype=hidden_states.dtype)
431
+ timestep = timestep.to(device=hidden_states.device, dtype=hidden_states.dtype).flatten()
432
+ if timestep.numel() == 1:
433
+ timestep = timestep.repeat(image_sizes.shape[0])
434
+ class_labels = class_labels.to(device=hidden_states.device, dtype=torch.long).flatten()
435
+
436
+ hidden_states = self.x_embedder(hidden_states).squeeze(1)
437
+ freqs_cos, freqs_sin = self.rope(image_sizes, hidden_states.device)
438
+
439
+ seqlens = image_sizes[:, 0] * image_sizes[:, 1]
440
+ cu_seqlens = torch.cat(
441
+ [torch.zeros(1, device=hidden_states.device, dtype=torch.int32), torch.cumsum(seqlens, dim=0).int()]
442
+ )
443
+
444
+ conditioning = self.t_embedder(timestep) + self.y_embedder(class_labels)
445
+ conditioning = torch.cat([conditioning[i].repeat(int(seqlens[i]), 1) for i in range(image_sizes.shape[0])], dim=0)
446
+
447
+ projection_states = []
448
+ for index, block in enumerate(self.blocks):
449
+ if self.use_checkpoint and self.training:
450
+ hidden_states = torch.utils.checkpoint.checkpoint(
451
+ block, hidden_states, conditioning, cu_seqlens, freqs_cos, freqs_sin, use_reentrant=False
452
+ )
453
+ else:
454
+ hidden_states = block(hidden_states, conditioning, cu_seqlens, freqs_cos, freqs_sin)
455
+ if output_projection_states and (index + 1) == self.encoder_depth:
456
+ projection_states.append(self.projector(hidden_states))
457
+
458
+ hidden_states = self.final_layer(hidden_states, conditioning)
459
+ hidden_states = hidden_states.reshape(hidden_states.shape[0], self.out_channels, self.patch_size, self.patch_size)
460
+ if input_was_image:
461
+ hidden_states = self._unpack_latents(hidden_states, image_sizes)
462
+
463
+ if not return_dict:
464
+ output = (hidden_states,)
465
+ if output_projection_states:
466
+ output = output + (tuple(projection_states),)
467
+ return output
468
+ return NiTTransformer2DModelOutput(
469
+ sample=hidden_states,
470
+ projection_states=tuple(projection_states) if output_projection_states else None,
471
+ )
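
The transformer operates on packed patch tokens rather than padded image batches: `_pack_latents` flattens each latent into `h·w` patches of shape `(C, p, p)` and records the per-sample token grid in `image_sizes`, and `_unpack_latents` reverses the layout. A minimal, self-contained sketch of that shape arithmetic (the sizes are hypothetical; `patch_size` is 1 in this checkpoint's config):

```python
import torch

# Hypothetical shapes: 2 latents of 16x16 tokens, 32 channels, patch_size 1.
B, C, H, W, p = 2, 32, 16, 16, 1
latents = torch.randn(B, C, H, W)

h, w = H // p, W // p
# Same reshape/permute as NiTTransformer2DModel._pack_latents:
packed = (
    latents.reshape(B, C, h, p, w, p)
    .permute(0, 2, 4, 1, 3, 5)
    .reshape(B * h * w, C, p, p)
)
image_sizes = torch.tensor([[h, w]] * B, dtype=torch.long)  # token grid per sample

print(packed.shape)          # torch.Size([512, 32, 1, 1])
print(image_sizes.tolist())  # [[16, 16], [16, 16]]
```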
demo_images/demo_sde250_class207_seed42.png ADDED

Git LFS Details

  • SHA256: eb6fd6d24d517744a597a8d5f3277f1b7a4a91834dbefba0607c69d04ceecd3f
  • Pointer size: 131 Bytes
  • Size of remote file: 453 kB
model_index.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "_class_name": "NiTPipeline",
3
+ "_diffusers_version": "0.30.1",
4
+ "scheduler": [
5
+ "diffusers",
6
+ "NiTFlowMatchScheduler"
7
+ ],
8
+ "transformer": [
9
+ "diffusers",
10
+ "NiTTransformer2DModel"
11
+ ],
12
+ "vae": [
13
+ "diffusers",
14
+ "AutoencoderDC"
15
+ ]
16
+ }
pipeline.py ADDED
@@ -0,0 +1,21 @@
1
+ """Custom pipeline entrypoint for Diffusers dynamic loading."""
2
+
3
+ from .custom_pipeline.pipeline_nit import NiTPipeline
4
+ from .custom_pipeline.scheduling_flow_match_nit import NiTFlowMatchScheduler
5
+ from .custom_pipeline.transformer_nit import NiTTransformer2DModel
6
+
7
+ try:
8
+ import diffusers
9
+
10
+ setattr(diffusers, "NiTPipeline", NiTPipeline)
11
+ setattr(diffusers, "NiTTransformer2DModel", NiTTransformer2DModel)
12
+ setattr(diffusers, "NiTFlowMatchScheduler", NiTFlowMatchScheduler)
13
+ except Exception:
14
+ pass
15
+
16
+
17
+ __all__ = [
18
+ "NiTPipeline",
19
+ "NiTTransformer2DModel",
20
+ "NiTFlowMatchScheduler",
21
+ ]
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_class_name": "NiTFlowMatchScheduler",
3
+ "mode": "ode",
4
+ "num_train_timesteps": 1000,
5
+ "path_type": "linear"
6
+ }
test_inference.py ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone inference script for the NiT-XL Diffusers checkpoint.
4
+
5
+ This script only uses code vendored in this model repository:
6
+ `custom_pipeline/` for NiT pipeline, transformer, and scheduler classes.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ from pathlib import Path
13
+
14
+ import torch
15
+ from diffusers import DiffusionPipeline
16
+
17
+
18
+ def parse_args() -> argparse.Namespace:
19
+ parser = argparse.ArgumentParser(description="Run class-conditional NiT-XL inference.")
20
+ parser.add_argument(
21
+ "--model-dir",
22
+ type=Path,
23
+ default=Path(__file__).resolve().parent,
24
+ help="Path to model repository root.",
25
+ )
26
+ parser.add_argument("--class-label", type=int, default=207, help="ImageNet class label to sample.")
27
+ parser.add_argument("--height", type=int, default=512, help="Output image height.")
28
+ parser.add_argument("--width", type=int, default=512, help="Output image width.")
29
+ parser.add_argument("--steps", type=int, default=250, help="Number of inference steps.")
30
+ parser.add_argument("--mode", choices=["ode", "sde"], default="sde", help="Sampling mode.")
31
+ parser.add_argument("--guidance-scale", type=float, default=2.05, help="Classifier-free guidance scale.")
32
+ parser.add_argument("--guidance-low", type=float, default=0.0, help="Guidance start timestep fraction.")
33
+ parser.add_argument("--guidance-high", type=float, default=0.7, help="Guidance end timestep fraction.")
34
+ parser.add_argument("--heun", action="store_true", help="Enable Heun correction for ODE mode.")
35
+ parser.add_argument("--seed", type=int, default=42, help="Random seed.")
36
+ parser.add_argument(
37
+ "--output",
38
+ type=Path,
39
+ default=Path("demo_images/demo_sde250_class207_seed42.png"),
40
+ help="Output image path relative to model dir, or absolute path.",
41
+ )
42
+ return parser.parse_args()
43
+
44
+
45
+ def resolve_output_path(model_dir: Path, output: Path) -> Path:
46
+ if output.is_absolute():
47
+ return output
48
+ return model_dir / output
49
+
50
+
51
+ def main() -> None:
52
+ args = parse_args()
53
+ model_dir = args.model_dir.resolve()
54
+ custom_dir = model_dir / "custom_pipeline"
55
+ if not custom_dir.exists():
56
+ raise FileNotFoundError(f"Missing custom pipeline dir: {custom_dir}")
57
+ if not (model_dir / "pipeline.py").exists():
58
+ raise FileNotFoundError(f"Missing custom entrypoint: {model_dir / 'pipeline.py'}")
59
+
60
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
61
+ torch_dtype = torch.bfloat16 if device.type == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
62
+ generator_device = device.type if device.type != "cpu" else "cpu"
63
+ generator = torch.Generator(device=generator_device).manual_seed(args.seed)
64
+
65
+ pipe = DiffusionPipeline.from_pretrained(
66
+ model_dir,
67
+ custom_pipeline=str(model_dir / "pipeline.py"),
68
+ local_files_only=True,
69
+ ).to(device=device)
70
+ if device.type == "cuda":
71
+ pipe.transformer.to(dtype=torch_dtype)
72
+ pipe.vae.to(dtype=torch_dtype)
73
+
74
+ output = pipe(
75
+ class_labels=[args.class_label],
76
+ height=args.height,
77
+ width=args.width,
78
+ num_inference_steps=args.steps,
79
+ mode=args.mode,
80
+ guidance_scale=args.guidance_scale,
81
+ guidance_interval=(args.guidance_low, args.guidance_high),
82
+ heun=args.heun,
83
+ generator=generator,
84
+ output_type="pil",
85
+ )
86
+
87
+ output_path = resolve_output_path(model_dir, args.output)
88
+ output_path.parent.mkdir(parents=True, exist_ok=True)
89
+ output.images[0].save(output_path)
90
+
91
+ print(f"Saved image to: {output_path}")
92
+ print(f"Device: {device} | dtype: {torch_dtype}")
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
transformer/config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "NiTTransformer2DModel",
3
+ "class_dropout_prob": 0.1,
4
+ "depth": 28,
5
+ "encoder_depth": 8,
6
+ "hidden_size": 1152,
7
+ "in_channels": 32,
8
+ "input_size": 32,
9
+ "num_classes": 1000,
10
+ "num_heads": 16,
11
+ "patch_size": 1,
12
+ "qk_norm": true,
13
+ "z_dim": 1280
14
+ }
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68cf19eb16e2231d1493dbb2c1bc7922fdfb23cc1e4b209aca6b6282238aa83b
3
+ size 2736207096
vae/config.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "_class_name": "AutoencoderDC",
3
+ "_diffusers_version": "0.32.2",
4
+ "attention_head_dim": 32,
5
+ "decoder_act_fns": "silu",
6
+ "decoder_block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512,
11
+ 1024,
12
+ 1024
13
+ ],
14
+ "decoder_block_types": [
15
+ "ResBlock",
16
+ "ResBlock",
17
+ "ResBlock",
18
+ "EfficientViTBlock",
19
+ "EfficientViTBlock",
20
+ "EfficientViTBlock"
21
+ ],
22
+ "decoder_layers_per_block": [
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 3,
28
+ 3
29
+ ],
30
+ "decoder_norm_types": "rms_norm",
31
+ "decoder_qkv_multiscales": [
32
+ [],
33
+ [],
34
+ [],
35
+ [
36
+ 5
37
+ ],
38
+ [
39
+ 5
40
+ ],
41
+ [
42
+ 5
43
+ ]
44
+ ],
45
+ "downsample_block_type": "Conv",
46
+ "encoder_block_out_channels": [
47
+ 128,
48
+ 256,
49
+ 512,
50
+ 512,
51
+ 1024,
52
+ 1024
53
+ ],
54
+ "encoder_block_types": [
55
+ "ResBlock",
56
+ "ResBlock",
57
+ "ResBlock",
58
+ "EfficientViTBlock",
59
+ "EfficientViTBlock",
60
+ "EfficientViTBlock"
61
+ ],
62
+ "encoder_layers_per_block": [
63
+ 2,
64
+ 2,
65
+ 2,
66
+ 3,
67
+ 3,
68
+ 3
69
+ ],
70
+ "encoder_qkv_multiscales": [
71
+ [],
72
+ [],
73
+ [],
74
+ [
75
+ 5
76
+ ],
77
+ [
78
+ 5
79
+ ],
80
+ [
81
+ 5
82
+ ]
83
+ ],
84
+ "in_channels": 3,
85
+ "latent_channels": 32,
86
+ "scaling_factor": 0.41407,
87
+ "upsample_block_type": "interpolate"
88
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfd991d1b54ffabf22745c5885589d8f2a7bc59930d95d92bd741c4fc64454bb
3
+ size 1249044836