Spaces:

p1atdev
/

JiT-AnimeFace-Demo

Running on Zero

App Files Files Community

Plat committed 13 days ago

Commit

4b08319

1 Parent(s): 5a8f4fc

init

Browse files

Files changed (8) hide show

.gitignore +14 -0
README.md +3 -3
app.py +206 -122
model/class_encoder.py +131 -0
model/config.py +96 -0
model/denoiser.py +833 -0
model/pipeline.py +412 -0
requirements.txt +2 -4

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+/models
+/output
+/notebooks

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
 title: JiT AnimeFace Demo
-emoji: 🖼
-colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 5.44.0
 app_file: app.py
 pinned: false
 license: apache-2.0

 ---
 title: JiT AnimeFace Demo
+emoji: 🚀
+colorFrom: red
 colorTo: red
 sdk: gradio
+sdk_version: 6.1.0
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py CHANGED Viewed

@@ -1,154 +1,238 @@
-import gradio as gr
-import numpy as np
-import random
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
 import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )
-            run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
-            )
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
                     maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
                 )
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
                     minimum=1,
-                    maximum=50,
                     step=1,
-                    value=2,  # Replace with defaults that work for your model
                 )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
-    )
 if __name__ == "__main__":
-    demo.launch()

+import spaces
+import json
+import yaml
+import os
 import torch
+import gradio as gr
+from huggingface_hub import hf_hub_download
+from model.pipeline import JiTModel, JiTConfig
+from model.config import ClassContextConfig
# Locations of the model artifacts on the Hub. Each value can be overridden
# through the environment variable of the same name.
MODEL_REPO = os.environ.get("MODEL_REPO", "p1atdev/JiT-AnimeFace-experiment")
MODEL_PATH = os.environ.get(
    "MODEL_PATH",
    "jit-b256-p16-cls/12-jit-animeface_00043e_033368s.safetensors",
)
LABEL2ID_PATH = os.environ.get("LABEL2ID_PATH", "jit-b256-p16-cls/label2id.json")
CONFIG_PATH = os.environ.get("CONFIG_PATH", "jit-b256-p16-cls/config.yml")

# Backend preference order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# Forwarded to `model.generate` as `max_token_length`.
MAX_TOKEN_LENGTH = 32

# Process-wide caches filled by `load_model`.
model_map: dict[str, JiTModel] = {}  # {model_path: model}
label2id_map: dict[str, dict] = {}  # {label2id_path: label2id}
def get_file_path(repo: str, path: str) -> str:
    """Fetch a file from the Hugging Face Hub.

    Args:
        repo: Hub repository id.
        path: Name of the file inside the repository.

    Returns:
        Whatever `hf_hub_download` returns for that file (a local path).
    """
    local_path = hf_hub_download(repo_id=repo, filename=path)
    return local_path
def load_label2id(label2id_path: str) -> dict:
    """Load the label -> id mapping stored as JSON.

    Args:
        label2id_path: Path of a local JSON file.

    Returns:
        The decoded JSON object.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which would corrupt or reject non-ASCII labels.
    with open(label2id_path, "r", encoding="utf-8") as f:
        return json.load(f)
def load_config(config_path: str) -> JiTConfig:
    """Load a model configuration file and validate it as a `JiTConfig`.

    Args:
        config_path: Path of a local `.json`, `.yaml` or `.yml` file. The
            parser is chosen from the file extension.

    Returns:
        The validated configuration.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    # Explicit UTF-8 so parsing does not depend on the platform's locale.
    with open(config_path, "r", encoding="utf-8") as f:
        if config_path.endswith(".json"):
            config_dict = json.load(f)
        elif config_path.endswith((".yaml", ".yml")):
            config_dict = yaml.safe_load(f)
        else:
            raise ValueError("Unsupported config file format. Use .json or .yaml/.yml")

    return JiTConfig.model_validate(config_dict)
def load_model(
    model_path: str,
    label2id_path: str,
    config_path: str,
    device: torch.device,
) -> tuple[JiTModel, dict]:
    """Load (or fetch from cache) the model and its label -> id mapping.

    The model cache is keyed by `model_path` and the mapping cache by
    `label2id_path`. The two caches are consulted independently, so a model
    hit combined with a mapping miss loads the mapping instead of raising
    `KeyError`.

    Args:
        model_path: Checkpoint file name inside `MODEL_REPO`.
        label2id_path: label2id JSON file name inside `MODEL_REPO`.
        config_path: Config file name inside `MODEL_REPO`.
        device: Device a freshly loaded model is moved to. A cached model is
            returned as-is.

    Returns:
        `(model, label2id)`.
    """
    model = model_map.get(model_path)
    if model is None:
        config = load_config(get_file_path(MODEL_REPO, config_path))
        if isinstance(config.context_encoder, ClassContextConfig):
            # Point the class-conditioning config at the downloaded mapping.
            config.context_encoder.label2id_map_path = get_file_path(
                MODEL_REPO, label2id_path
            )

        model = JiTModel.from_pretrained(
            config=config,
            checkpoint_path=get_file_path(MODEL_REPO, model_path),
        )
        # Inference only: eval mode, no gradients.
        model.eval()
        model.requires_grad_(False)
        model.to(device=device)
        model_map[model_path] = model  # cache

    label2id = label2id_map.get(label2id_path)
    if label2id is None:
        label2id = load_label2id(get_file_path(MODEL_REPO, label2id_path))
        label2id_map[label2id_path] = label2id  # cache

    return model, label2id
@spaces.GPU(duration=5)
def generate_images(
    prompt: str,
    negative_prompt: str,
    num_steps: int,
    cfg_scale: float,
    batch_size: int,
    size: int,
    seed: int,
    #
    model_path: str = MODEL_PATH,
    label2id_path: str = LABEL2ID_PATH,
    config_path: str = CONFIG_PATH,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate a batch of square images for one prompt.

    Args:
        prompt: Prompt text; it is repeated `batch_size` times.
        negative_prompt: Negative prompt, forwarded unchanged.
        num_steps: Number of inference steps.
        cfg_scale: Classifier-free guidance scale.
        batch_size: Number of images to generate.
        size: Used for both height and width.
        seed: Non-negative value to fix the seed; a negative value or `None`
            (an emptied number field) means no fixed seed.
        model_path: Checkpoint file name, see `load_model`.
        label2id_path: label2id file name, see `load_model`.
        config_path: Config file name, see `load_model`.
        progress: Gradio progress tracker (tqdm tracking enabled).

    Returns:
        Whatever `model.generate` returns (wired to a `gr.Gallery` by the UI).
    """
    model, _label2id = load_model(
        model_path=model_path,
        label2id_path=label2id_path,
        config_path=config_path,
        device=DEVICE,
    )

    # UI number widgets can deliver floats (e.g. 16.0) or None for an empty
    # field. List repetition and seeding need real ints, and `None >= 0`
    # would raise TypeError.
    num_steps = int(num_steps)
    batch_size = int(batch_size)
    size = int(size)
    fixed_seed = int(seed) if seed is not None and seed >= 0 else None

    with torch.inference_mode():
        images = model.generate(
            prompt=[prompt] * batch_size,
            negative_prompt=negative_prompt,
            num_inference_steps=num_steps,
            cfg_scale=cfg_scale,
            height=size,
            width=size,
            max_token_length=MAX_TOKEN_LENGTH,
            cfg_time_range=[0.1, 1.0],
            seed=fixed_seed,
            device=DEVICE,
            execution_dtype=model.config.torch_dtype,
        )

    return images
def demo():
    """Build the Gradio UI and wire its controls to `generate_images`.

    Returns:
        The `gr.Blocks` instance (not yet launched).
    """
    # Shared by the input defaults and the example row.
    default_prompt = "general 1girl solo portrait looking_at_viewer blue_hair short_hair blush cat_ears open_mouth cat_ears animal_ears red_eyes white_background"
    default_negative_prompt = "retro_artstyle 1990s_(style) sketch"

    with gr.Blocks() as ui:
        gr.Markdown(f"""
# JiT-AnimeFace Demo
Pixel-space x-prediction flow-matching model for anime face generation, trained from scratch.
See full supported tags: [label2id.json](https://huggingface.co/{MODEL_REPO}/blob/main/{LABEL2ID_PATH}).
""")

        with gr.Row():
            with gr.Column():
                prompt = gr.TextArea(
                    label="Prompt",
                    info="Space-separated tags. Not all of danbooru tags are supported. See the link above for full supported tags.",
                    value=default_prompt,
                    placeholder="e.g.: general 1girl solo portrait looking_at_viewer",
                )
                negative_prompt = gr.TextArea(
                    label="Negative Prompt",
                    value=default_negative_prompt,
                    lines=2,
                    placeholder="e.g.: retro_artstyle 1990s_(style) sketch",
                )
                num_steps = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=25,
                    step=4,
                    label="Number of Steps",
                )
                cfg_scale = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.25,
                    label="CFG Scale",
                )
                batch_size = gr.Slider(
                    minimum=1,
                    maximum=64,
                    value=16,
                    step=1,
                    label="Batch Size",
                )
                size = gr.Slider(
                    minimum=64,
                    maximum=320,
                    value=256,
                    step=64,
                    label="Image Size",
                )
                seed = gr.Number(
                    value=-1,
                    label="Seed (-1 for random)",
                    # Round to an integer so the callback gets an int seed,
                    # not a float.
                    precision=0,
                )

            with gr.Column(scale=2):
                generate_button = gr.Button("Generate Images", variant="primary")
                output_gallery = gr.Gallery(
                    label="Generated Images",
                    columns=4,
                    height="768px",
                    preview=False,
                    show_label=True,
                )

            gr.Examples(
                examples=[[default_prompt, default_negative_prompt]],
                inputs=[prompt, negative_prompt],
            )

        # Run generation on button click or prompt submit.
        gr.on(
            triggers=[generate_button.click, prompt.submit],
            fn=generate_images,
            inputs=[
                prompt,
                negative_prompt,
                num_steps,
                cfg_scale,
                batch_size,
                size,
                seed,
            ],
            outputs=output_gallery,
        )

    return ui
 if __name__ == "__main__":
+    load_model(
+        model_path=MODEL_PATH,
+        label2id_path=LABEL2ID_PATH,
+        config_path=CONFIG_PATH,
+        device=DEVICE,
+    )
+    demo().launch()

model/class_encoder.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import warnings
+from typing import NamedTuple
+import torch
+import torch.nn as nn
+PromptType = str | list[str]
+class ClassTokenizerOutput(NamedTuple):
+    class_ids: torch.Tensor
+    attention_mask: torch.Tensor
+class ClassTokenizer:
+    def __init__(
+        self,
+        label2id: dict[str, int],
+        splitter: str = " ",
+    ) -> None:
+        self.label2id = label2id
+        self.id2label = {v: k for k, v in label2id.items()}
+        self.splitter = splitter
+        self.pad_token_id = len(label2id)
+        assert all([id < len(label2id) for id in label2id.values()]), (
+            "All label IDs must be less than the number of classes."
+        )
+    def normalize_prompts(
+        self,
+        class_names: PromptType,
+    ) -> list[str]:
+        _class_names: list[str] = (
+            class_names if isinstance(class_names, list) else [class_names]
+        )
+        return _class_names
+    def tokenize(
+        self,
+        prompts: PromptType,
+        max_length: int = 32,
+    ) -> ClassTokenizerOutput:
+        # 1. Normalize class names
+        _prompts = self.normalize_prompts(prompts)
+        # 2. Convert to IDs
+        class_ids = []
+        masks = []
+        for text in _prompts:
+            ids = []
+            for label in text.split(self.splitter):
+                if label.strip() == "":
+                    continue
+                id = self.label2id.get(label.strip())
+                if id is not None:  # 0 is OK
+                    ids.append(id)
+                    masks.append(1)
+                else:
+                    warnings.warn(f"Label '{label}' not found in label2id mapping.")
+            class_ids.append(ids)
+        # 3. Pad to max_length
+        padded_class_ids = []
+        padded_masks = []
+        for _i, ids in enumerate(class_ids):
+            if len(ids) < max_length:
+                mask = [1] * len(ids) + [0] * (max_length - len(ids))
+                ids = ids + [self.pad_token_id] * (max_length - len(ids))  # padding idx
+            else:
+                mask = [1] * max_length
+                ids = ids[:max_length]
+            padded_class_ids.append(ids)
+            padded_masks.append(mask)
+        return ClassTokenizerOutput(
+            class_ids=torch.tensor(padded_class_ids, dtype=torch.long),
+            attention_mask=torch.tensor(padded_masks, dtype=torch.long),
+        )
class ClassEncoderOutput(NamedTuple):
    """Result of `ClassEncoder.encode_prompts`."""

    embeddings: torch.Tensor
    attention_mask: torch.Tensor


class ClassEncoder(nn.Module):
    """Embed class-label prompts with a learned embedding table.

    The table has `len(label2id) + 1` rows; the extra last row is the padding
    entry (`padding_idx`).
    """

    def __init__(
        self,
        label2id: dict[str, int],
        embedding_dim: int,
    ):
        """
        Args:
            label2id: Mapping from label text to integer id.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()

        self.num_classes = len(label2id)
        self.pad_token_id = self.num_classes  # padding idx

        self.embedding = nn.Embedding(
            self.num_classes + 1,  # +1 for padding idx
            embedding_dim,
            padding_idx=self.num_classes,
        )
        self.tokenizer = ClassTokenizer(label2id)

    def initialize_weights(self):
        """Re-initialise the embedding table from N(0, 0.02)."""
        # NOTE(review): this also overwrites the padding row, which
        # nn.Embedding initialises to zeros — confirm that is intended.
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.02)

    def encode_prompts(
        self,
        prompts: PromptType,
        max_token_length: int = 32,
    ):
        """Tokenize prompts and look up their embeddings.

        Args:
            prompts: One prompt or a list of prompts.
            max_token_length: Number of label slots per prompt.

        Returns:
            `ClassEncoderOutput` with both tensors on the embedding table's
            device.
        """
        # 1. Tokenize prompts
        class_ids, attention_mask = self.tokenizer.tokenize(
            prompts,
            max_length=max_token_length,
        )

        # 2. Get embeddings. The mask is moved to the same device, so callers
        #    do not get a CPU mask alongside accelerator embeddings.
        device = self.embedding.weight.device
        embeddings = self.embedding(class_ids.to(device))

        return ClassEncoderOutput(
            embeddings=embeddings,
            attention_mask=attention_mask.to(device),
        )

model/config.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import torch
+import json
+from typing import Literal
+from pydantic import BaseModel
FP32_STR = ["float32", "fp32"]
FP16_STR = ["float16", "fp16", "half"]
BF16_STR = ["bfloat16", "bf16"]


def str_to_dtype(dtype_str: str) -> torch.dtype:
    """Map a case-insensitive dtype name to the matching `torch.dtype`.

    Args:
        dtype_str: One of the aliases in `FP32_STR`, `FP16_STR` or `BF16_STR`.

    Raises:
        ValueError: If the name is not a known alias.
    """
    name = dtype_str.lower()
    alias_table = (
        (FP32_STR, torch.float32),
        (FP16_STR, torch.float16),
        (BF16_STR, torch.bfloat16),
    )
    for aliases, dtype in alias_table:
        if name in aliases:
            return dtype
    raise ValueError(f"Unsupported dtype string: {name}")
class DenoiserConfig(BaseModel):
    """Hyper-parameters of the JiT denoiser; `context_dim` has no default."""

    # Patch embedding
    patch_size: int = 16  # conv kernel size and stride of the patch embedder
    in_channels: int = 3
    out_channels: int = 3
    bottleneck_dim: int = 128  # channels between the two patch-embed convs

    # Transformer
    hidden_size: int = 1024
    depth: int = 24  # presumably the number of transformer blocks
    num_heads: int = 16
    mlp_ratio: float = 4.0

    # Dropout
    attn_dropout: float = 0.0
    proj_dropout: float = 0.0

    # Rows of the learned time-token position embedding
    num_time_tokens: int = 4

    # RoPE settings, passed to RopeEmbedder; the axes dims must sum to
    # hidden_size / num_heads
    rope_theta: float = 256.0
    rope_axes_dims: list[int] = [16, 24, 24]
    rope_axes_lens: list[int] = [256, 128, 128]
    rope_zero_centered: list[bool] = [False, True, True]

    # Input feature size of the context embedder
    context_dim: int
class JiT_B_16_Config(DenoiserConfig):
    """`DenoiserConfig` preset: 12 layers, width 768, 12 heads, patch size 16."""

    patch_size: int = 16
    bottleneck_dim: int = 128

    depth: int = 12
    hidden_size: int = 768
    num_heads: int = 12

    context_dim: int = 768

    # sum = 64 = 768 / 12
    rope_axes_dims: list[int] = [16, 24, 24]
    rope_axes_lens: list[int] = [
        256,  # max 256 token text
        128,  # 2048x2048 image size
        128,
    ]
+ContextType = Literal["class", "text"]
class ClassContextConfig(BaseModel):
    """Context-encoder settings for class (label) conditioning."""

    type: Literal["class"] = "class"
    label2id_map_path: str  # path of the JSON file holding label -> id

    @property
    def label2id(self) -> dict[str, int]:
        """Read the mapping from `label2id_map_path` (re-read on every access)."""
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(self.label2id_map_path, "r", encoding="utf-8") as f:
            return json.load(f)
class TextContextConfig(BaseModel):
    """Context-encoder settings for text conditioning."""

    type: Literal["text"] = "text"
    # Identifier of the pretrained text model to use.
    pretrained_model: str = "p1atdev/Qwen3-VL-2B-Instruct-Text-Only"
+ContextConfig = ClassContextConfig | TextContextConfig
class JiTConfig(BaseModel):
    """Top-level model configuration: dtype, context encoder and denoiser."""

    # dtype name; converted by the `torch_dtype` property
    dtype: str = "float32"

    context_encoder: ContextConfig
    denoiser: DenoiserConfig = JiT_B_16_Config()

    @property
    def torch_dtype(self) -> torch.dtype:
        """`dtype` converted to a `torch.dtype` through `str_to_dtype`."""
        resolved = str_to_dtype(self.dtype)
        return resolved

model/denoiser.py ADDED Viewed

	@@ -0,0 +1,833 @@

+# Reference: https://github.com/LTH14/JiT/blob/main/model_jit.py
+import math
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as checkpoint
+import torch.nn.functional as F
+from .config import DenoiserConfig
+# https://github.com/huggingface/diffusers/blob/66bf7ea5be7099c8a47b9cba135f276d55247447/src/diffusers/models/embeddings.py#L27
def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
):
    """Sinusoidal embedding of a 1-D tensor of timesteps.

    Args:
        timesteps: 1-D tensor, one value per batch element.
        embedding_dim: Size of the returned embedding.
        flip_sin_to_cos: If True the layout is `[cos, sin]`, else `[sin, cos]`.
        downscale_freq_shift: Subtracted from `embedding_dim // 2` in the
            exponent denominator.
        scale: Multiplier applied to the angles before sin/cos.
        max_period: Sets the lowest frequency.

    Returns:
        Float tensor of shape `(len(timesteps), embedding_dim)`; an odd
        `embedding_dim` gets one trailing zero column.
    """
    assert timesteps.dim() == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    steps = torch.arange(
        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
    )
    exponent = (-math.log(max_period) * steps) / (half_dim - downscale_freq_shift)
    frequencies = torch.exp(exponent)

    # Outer product timestep x frequency, then the optional scaling.
    angles = scale * (timesteps[:, None].float() * frequencies[None, :])

    sin_half = torch.sin(angles)
    cos_half = torch.cos(angles)
    ordered = [cos_half, sin_half] if flip_sin_to_cos else [sin_half, cos_half]
    emb = torch.cat(ordered, dim=-1)

    # Zero-pad one column when the requested size is odd.
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
+class FP32RMSNorm(nn.RMSNorm):
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return F.rms_norm(
+            hidden_states.to(torch.float32),
+            self.normalized_shape,
+            weight=self.weight,
+            eps=self.eps,
+        ).to(hidden_states.dtype)
class BottleneckPatchEmbed(nn.Module):
    """Image-to-patch embedding through a low-dimensional bottleneck.

    A strided convolution (kernel = stride = `patch_size`, no bias) maps each
    patch to `bottleneck_dim` channels; a 1x1 convolution then expands to
    `hidden_dim`.
    """

    def __init__(
        self,
        patch_size: int = 16,
        in_channels: int = 3,
        bottleneck_dim: int = 128,
        hidden_dim: int = 768,
        bias: bool = True,
    ):
        """
        Args:
            patch_size: Side length of each square patch.
            in_channels: Number of image channels.
            bottleneck_dim: Channels after the first projection.
            hidden_dim: Channels after the second projection.
            bias: Whether the second projection has a bias.
        """
        super().__init__()

        self.patch_size = patch_size
        self.in_channels = in_channels
        self.bottleneck_dim = bottleneck_dim
        self.hidden_dim = hidden_dim
        self.bias = bias

        # Patchify and compress.
        self.proj_1 = nn.Conv2d(
            in_channels,
            bottleneck_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
        )
        # Per-patch expansion to the model width.
        self.proj_2 = nn.Conv2d(
            bottleneck_dim,
            hidden_dim,
            kernel_size=1,
            stride=1,
            bias=bias,
        )

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """Map `[B, C, H, W]` images to `[B, num_patches, hidden_dim]` tokens."""
        # [B, C, H, W] -> [B, bottleneck_dim, H/patch_size, W/patch_size]
        compressed = self.proj_1(image)
        # -> [B, hidden_dim, H/patch_size, W/patch_size]
        expanded = self.proj_2(compressed)
        # -> [B, hidden_dim, num_patches] -> [B, num_patches, hidden_dim]
        tokens = expanded.flatten(2).transpose(1, 2)
        return tokens
class TimestepEmbedder(nn.Module):
    """Embed scalar timesteps: sinusoidal features followed by a 2-layer MLP."""

    def __init__(
        self,
        hidden_dim: int,
        freq_embedding_size: int = 256,
    ):
        """
        Args:
            hidden_dim: Output size of the MLP.
            freq_embedding_size: Size of the sinusoidal features fed to it.
        """
        super().__init__()

        self.freq_embedding_size = freq_embedding_size

        layers = [
            nn.Linear(freq_embedding_size, hidden_dim, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim, bias=True),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, timestep: torch.Tensor) -> torch.Tensor:
        """Return the embedding for a 1-D tensor of timesteps."""
        sinusoid = get_timestep_embedding(
            timestep,
            embedding_dim=self.freq_embedding_size,
            flip_sin_to_cos=True,
            downscale_freq_shift=0,
        )
        # The sinusoid is float32; cast it to the dtype of the MLP weights.
        mlp_dtype = self.mlp[0].weight.dtype
        return self.mlp(sinusoid.to(dtype=mlp_dtype))
def apply_rope(
    inputs: torch.Tensor,  # (batch_size, num_heads, seq_len, dim)
    freqs_cis: torch.Tensor,  # (batch_size, seq_len, dim//2) complex64
) -> torch.Tensor:
    """Rotate consecutive feature pairs of `inputs` by `freqs_cis`.

    Each pair `(x[2i], x[2i+1])` is read as one complex number and multiplied
    by the matching entry of `freqs_cis`, broadcast over heads. The math runs
    in float32 with CUDA autocast disabled; the result is cast back to the
    dtype of `inputs`.
    """
    batch_size, num_heads, seq_len, dim = inputs.shape

    with torch.autocast(device_type="cuda", enabled=False):
        pairs = inputs.float().view(batch_size, num_heads, seq_len, dim // 2, 2)
        as_complex = torch.view_as_complex(pairs)

        # (batch_size, 1, seq_len, dim//2): broadcast across heads
        rotation = freqs_cis.unsqueeze(1)

        rotated = torch.view_as_real(as_complex * rotation).flatten(3)
        return rotated.type_as(inputs)
+class RopeEmbedder:
+    def __init__(
+        self,
+        rope_theta: float = 256.0,  # ref: Z-Image
+        axes_dims: list[int] = [32, 64, 64],  # text, height, width
+        axes_lens: list[int] = [256, 128, 128],  # text, height, width
+        zero_centered: list[bool] = [False, True, True],
+    ):
+        self.rope_theta = rope_theta
+        self.axes_dims = axes_dims
+        self.axes_lens = axes_lens
+        self.zero_centered = zero_centered
+        # text starts with 0, image axes are zero-centered
+        self.freqs_cis = self.precompute_freqs_cis(
+            theta=self.rope_theta,
+            dims=self.axes_dims,
+            lens=self.axes_lens,
+            zero_centered=self.zero_centered,
+        )
+    @staticmethod
+    def get_rope_freqs(
+        dim: int,
+        min_position: int = 0,
+        max_position: int = 128,
+        theta: float = 10000.0,
+    ) -> torch.Tensor:
+        freqs = 1.0 / (
+            theta
+            ** (
+                torch.arange(0, dim, 2, dtype=torch.float64, device=torch.device("cpu"))
+                / dim
+            )
+        )
+        positions = torch.arange(
+            start=min_position,
+            end=max_position,
+            dtype=torch.float64,
+            device=torch.device("cpu"),
+        )
+        freqs = torch.outer(positions, freqs).float()  # (max_position, dim//2)
+        # ↓pos, → dim//2
+        # [ min_position * [1/θ^(0/dim), 1/θ^(2/dim), 1/θ^(4/dim), ..., 1/θ^((dim-2)/dim)]
+        #   ...
+        #   0 * [1/θ^(0/dim), 1/θ^(2/dim), 1/θ^(4/dim), ..., 1/θ^((dim-2)/dim)]
+        #   1 * [1/θ^(0/dim), 1/θ^(2/dim), 1/θ^(4/dim), ..., 1/θ^((dim-2)/dim)]
+        #   2 * [1/θ^(0/dim), 1/θ^(2/dim), 1/θ^(4/dim), ..., 1/θ^((dim-2)/dim)]
+        #   ...
+        #   max_position * [1/θ^(0/dim), 1/θ^(2/dim), 1/θ^(4/dim), ..., 1/θ^((dim-2)/dim)] ]
+        freqs_cis = torch.polar(
+            abs=torch.ones_like(freqs),
+            angle=freqs,
+        ).to(torch.complex64)  # (min_position~max_position, dim//2) complex64
+        # 大きさは変えずに回転を表す複素数
+        return freqs_cis
+    @staticmethod
+    def precompute_freqs_cis(
+        theta: float,
+        dims: list[int],
+        lens: list[int],
+        zero_centered: list[bool],
+    ):
+        freqs_cis = []
+        for i, (dim, len_) in enumerate(zip(dims, lens)):
+            freq_cis = RopeEmbedder.get_rope_freqs(
+                dim=dim,
+                min_position=(len_ // 2) - len_ if zero_centered[i] else 0,
+                max_position=len_ // 2 if zero_centered[i] else len_,
+                theta=theta,
+            )  # (len_, dim//2) complex64
+            freqs_cis.append(freq_cis)
+        return freqs_cis
+    # get frequencies for given position ids
+    def __call__(self, position_ids: torch.Tensor):
+        # move to device
+        freqs_cis = [fc.to(position_ids.device) for fc in self.freqs_cis]
+        result = []
+        for i in range(len(self.axes_dims)):
+            index = (
+                position_ids[..., i : i + 1]
+                .repeat(
+                    # match dimensions for each axis
+                    1,  # batch size?
+                    1,  # sequence length?
+                    freqs_cis[i].shape[-1],
+                )
+                .to(torch.int64)
+            )
+            result.append(
+                torch.gather(
+                    freqs_cis[i].unsqueeze(0).repeat(index.shape[0], 1, 1),
+                    dim=1,
+                    index=index,
+                )
+            )
+        return torch.cat(result, dim=-1)
class Attention(nn.Module):
    """Multi-head self-attention with optional QK RMS-norm and rotary embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        qk_norm: bool = True,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
    ):
        """
        Args:
            dim: Model width; split evenly across heads.
            num_heads: Number of attention heads.
            qkv_bias: Whether the q/k/v projections have a bias.
            qk_norm: Apply per-head RMS-norm to q and k when True.
            attn_dropout: Attention dropout probability (training only).
            proj_dropout: Dropout applied to the output projection.
        """
        super().__init__()

        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.q_norm = FP32RMSNorm(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = FP32RMSNorm(self.head_dim) if qk_norm else nn.Identity()

        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
        self.to_k = nn.Linear(dim, dim, bias=qkv_bias)
        self.to_v = nn.Linear(dim, dim, bias=qkv_bias)

        # Only its probability `p` is read, in `forward`.
        self.attn_dropout = nn.Dropout(attn_dropout)

        self.to_o = nn.Linear(dim, dim)
        self.proj_dropout = nn.Dropout(proj_dropout)

    def _pre_attn_reshape(self, x: torch.Tensor):
        """[B, N, D] -> [B, num_heads, N, head_dim]."""
        batch_size, seq_len, _dim = x.shape
        heads = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return heads.transpose(1, 2)

    def _post_attn_reshape(self, x: torch.Tensor):
        """[B, num_heads, N, head_dim] -> [B, N, D]."""
        batch_size, num_heads, seq_len, head_dim = x.shape
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_len, num_heads * head_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        rope_freqs: torch.Tensor,
        mask: torch.Tensor | None = None,  # 1: attend, 0: ignore
    ) -> torch.Tensor:
        """Self-attention over `hidden_states` of shape [B, N, D].

        Args:
            hidden_states: Input tokens.
            rope_freqs: Rotations passed to `apply_rope` for q and k.
            mask: Optional (B, N) key mask; 1 = attend, 0 = ignore.
        """
        batch_size, seq_len, _dim = hidden_states.shape

        # QKV, each [B, num_heads, N, head_dim]
        q, k, v = (
            self._pre_attn_reshape(proj(hidden_states))
            for proj in (self.to_q, self.to_k, self.to_v)
        )

        # QKNorm, then rotary embedding on q and k
        q = apply_rope(self.q_norm(q), rope_freqs)
        k = apply_rope(self.k_norm(k), rope_freqs)

        attn_mask = None
        if mask is not None:
            # (batch_size, seq_len) -> (batch_size, num_heads, seq_len, seq_len)
            key_mask = mask.bool().view(batch_size, 1, 1, seq_len)
            attn_mask = key_mask.expand(-1, self.num_heads, seq_len, -1)

        dropout_p = self.attn_dropout.p if self.training else 0.0
        attn = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attn_mask,
            dropout_p=dropout_p,
            is_causal=False,
        ).to(hidden_states.dtype)

        # Merge heads, project, dropout.
        merged = self._post_attn_reshape(attn)
        return self.proj_dropout(self.to_o(merged))
class SwiGLU(nn.Module):
    """Gated feed-forward layer: `w_3(dropout(silu(w_1 x) * w_2 x))`."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        dropout: float = 0.0,
        bias: bool = True,
    ):
        """
        Args:
            dim: Input and output feature size.
            hidden_dim: Nominal hidden size; the width actually used is
                `int(hidden_dim * 2 / 3)`.
            dropout: Dropout probability applied before the output projection.
            bias: Whether the three linear layers have biases.
        """
        super().__init__()

        inner_dim = int(hidden_dim * 2 / 3)

        self.w_1 = nn.Linear(dim, inner_dim, bias=bias)  # gate branch
        self.w_2 = nn.Linear(dim, inner_dim, bias=bias)  # value branch
        self.w_3 = nn.Linear(inner_dim, dim, bias=bias)  # output projection
        self.ffn_dropout = nn.Dropout(dropout)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        gate = F.silu(self.w_1(hidden_states))
        value = self.w_2(hidden_states)
        gated = self.ffn_dropout(gate * value)
        return self.w_3(gated)
class FinalLayer(nn.Module):
    """Output head: RMS-norm, SwiGLU MLP, then a linear map to patch pixels."""

    def __init__(
        self,
        hidden_dim: int,
        mlp_ratio: float,
        patch_size: int,
        out_channels: int,
    ):
        """
        Args:
            hidden_dim: Token feature size.
            mlp_ratio: The MLP's nominal hidden size is `hidden_dim * mlp_ratio`.
            patch_size: Side length of a square patch.
            out_channels: Channels per output pixel.
        """
        super().__init__()

        self.norm_final = FP32RMSNorm(hidden_dim)
        self.mlp = SwiGLU(
            dim=hidden_dim,
            hidden_dim=int(hidden_dim * mlp_ratio),
            dropout=0.0,
            bias=True,
        )
        # One output value per pixel and channel of a patch.
        values_per_patch = patch_size * patch_size * out_channels
        self.linear = nn.Linear(hidden_dim, values_per_patch, bias=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Map tokens to `patch_size * patch_size * out_channels` values each."""
        normed = self.norm_final(hidden_states)
        return self.linear(self.mlp(normed))
class JiTBlock(nn.Module):
    """Pre-norm transformer block: attention and SwiGLU MLP, each with a residual."""

    def __init__(
        self,
        hidden_dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        ffn_dropout: float = 0.0,
        qkv_bias: bool = True,
        qk_norm: bool = True,
        bias: bool = True,
    ):
        """
        Args:
            hidden_dim: Token feature size.
            num_heads: Number of attention heads.
            mlp_ratio: The MLP's nominal hidden size is `hidden_dim * mlp_ratio`.
            attn_dropout: Attention dropout probability.
            proj_dropout: Dropout after the attention output projection.
            ffn_dropout: Dropout inside the MLP.
            qkv_bias: Whether the q/k/v projections have a bias.
            qk_norm: Whether q and k are RMS-normalised.
            bias: Whether the MLP linears have biases.
        """
        super().__init__()

        # Only norm1 sets eps explicitly; norm2 keeps the RMSNorm default.
        self.norm1 = FP32RMSNorm(hidden_dim, eps=1e-6)
        self.attn = Attention(
            dim=hidden_dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
        )

        self.norm2 = FP32RMSNorm(hidden_dim)
        self.mlp = SwiGLU(
            dim=hidden_dim,
            hidden_dim=int(hidden_dim * mlp_ratio),
            dropout=ffn_dropout,
            bias=bias,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        rope_freqs: torch.Tensor,
        mask: torch.Tensor | None = None,
    ):
        """Run the block; `rope_freqs` and `mask` are passed to the attention."""
        # attn
        attn_out = self.attn(self.norm1(hidden_states), rope_freqs, mask=mask)
        hidden_states = hidden_states + attn_out

        # mlp
        mlp_out = self.mlp(self.norm2(hidden_states))
        hidden_states = hidden_states + mlp_out

        return hidden_states
+class JiT(nn.Module):
+    def __init__(self, config: DenoiserConfig):
+        super().__init__()
+        self.config = config
+        assert (config.hidden_size // config.num_heads) == sum(config.rope_axes_dims), (
+            "The sum of rope_axes_dims must equal to hidden_size / num_heads = head_dim."
+        )
+        self.num_axes = len(
+            config.rope_axes_dims
+        )  # 0: image_index, 1: height, 2: width
+        # image patch embedder
+        self.patch_embedder = BottleneckPatchEmbed(
+            patch_size=config.patch_size,
+            in_channels=config.in_channels,
+            bottleneck_dim=config.bottleneck_dim,
+            hidden_dim=config.hidden_size,
+            bias=True,
+        )
+        # timestep embedder
+        self.time_embedder = TimestepEmbedder(
+            hidden_dim=config.hidden_size,
+            freq_embedding_size=256,
+        )
+        self.time_position_embeds = nn.Parameter(
+            torch.randn(
+                config.num_time_tokens,
+                config.hidden_size,
+            ),
+            requires_grad=True,
+        )
+        # RoPE embedder
+        self.rope_embedder = RopeEmbedder(
+            rope_theta=config.rope_theta,
+            axes_dims=config.rope_axes_dims,
+            axes_lens=config.rope_axes_lens,
+            zero_centered=config.rope_zero_centered,
+        )
+        # class condition or text embedding
+        self.context_embedder = nn.Linear(
+            config.context_dim,
+            config.hidden_size,
+            bias=True,
+        )
+        self.blocks = nn.ModuleList(
+            [
+                JiTBlock(
+                    hidden_dim=config.hidden_size,
+                    num_heads=config.num_heads,
+                    mlp_ratio=config.mlp_ratio,
+                    attn_dropout=config.attn_dropout,
+                    proj_dropout=config.proj_dropout,
+                    ffn_dropout=0.0,
+                    qkv_bias=True,
+                    qk_norm=True,
+                    bias=True,
+                )
+                for _ in range(config.depth)
+            ]
+        )
+        self.final_layer = FinalLayer(
+            hidden_dim=config.hidden_size,
+            mlp_ratio=config.mlp_ratio,
+            patch_size=config.patch_size,
+            out_channels=config.in_channels,
+        )
+        self.gradient_checkpointing = False
+    def initialize_weights(self):
+        # Initialize weights
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.RMSNorm):
+                nn.init.ones_(m.weight)
+        # patch embed
+        w_1 = self.patch_embedder.proj_1.weight
+        nn.init.xavier_uniform_(w_1.view([w_1.shape[0], -1]))
+        w_2 = self.patch_embedder.proj_2.weight
+        nn.init.xavier_uniform_(w_2.view([w_2.shape[0], -1]))
+        if self.patch_embedder.proj_2.bias is not None:
+            nn.init.zeros_(self.patch_embedder.proj_2.bias)
+        # time position embeds
+        nn.init.normal_(
+            self.time_position_embeds,
+            std=0.02,
+        )
+        # time embedder
+        nn.init.normal_(
+            self.time_embedder.mlp[0].weight,  # type: ignore
+            std=0.02,
+        )
+        nn.init.normal_(
+            self.time_embedder.mlp[2].weight,  # type: ignore
+            std=0.02,
+        )
+    def set_gradient_checkpointing(self, enable: bool = True):
+        self.gradient_checkpointing = enable
+    def prepare_image_position_ids(
+        self,
+        height: int,
+        width: int,
+        image_index: int,
+    ) -> torch.Tensor:
+        # [H/patch_size, W/patch_size]
+        patch_size = self.config.patch_size
+        h_patches = height // patch_size
+        w_patches = width // patch_size
+        position_ids = torch.zeros(
+            h_patches,
+            w_patches,
+            self.num_axes,
+        )
+        # image_index
+        position_ids[:, :, 0] = image_index  # image
+        # height (y-index)
+        position_ids[:, :, 1] = (
+            torch.arange(
+                h_patches,
+            )
+            .unsqueeze(1)
+            .repeat(1, w_patches)
+        )
+        # width (x-index)
+        position_ids[:, :, 2] = (
+            torch.arange(
+                w_patches,
+            )
+            .unsqueeze(0)
+            .repeat(h_patches, 1)
+        )
+        return position_ids.view(-1, self.num_axes)  # (num_patches, n_axes)
+    def prepare_context_position_ids(
+        self,
+        seq_len: int,
+        context_start_index: int = 0,
+        xy_position: int = 0,
+    ) -> torch.Tensor:
+        position_ids = torch.zeros(
+            seq_len,
+            self.num_axes,
+        )
+        # context_index (0, ..., seq_len-1)
+        position_ids[:, 0] = torch.arange(
+            context_start_index,
+            context_start_index + seq_len,
+        )  # text
+        # token indices are (0, 0)...(0, 0)
+        position_ids[:, 1] = xy_position
+        position_ids[:, 2] = xy_position
+        return position_ids
+    def prepare_time_position_ids(
+        self,
+        seq_len: int,
+        time_start_index: int,
+        xy_position: int = 0,
+    ) -> torch.Tensor:
+        position_ids = torch.zeros(
+            seq_len,
+            self.num_axes,
+        )
+        # time_index
+        position_ids[:, 0] = torch.arange(
+            time_start_index, time_start_index + seq_len
+        )  # time
+        # token indices are (0, 0)...(0, 0)
+        position_ids[:, 1] = xy_position
+        position_ids[:, 2] = xy_position
+        return position_ids
+    def unpatchify(
+        self,
+        patches: torch.Tensor,
+        height: int,
+        width: int,
+    ) -> torch.Tensor:
+        batch_size, num_patches, _patch_dim = patches.shape
+        patch_size = self.config.patch_size
+        out_channels = self.config.out_channels
+        h_patches = height // patch_size
+        w_patches = width // patch_size
+        assert num_patches == h_patches * w_patches, "Mismatch in number of patches"
+        # [B, N, patch_size*patch_size*C] -> [B, H_patch, W_patch, patch_size, patch_size, C]
+        patches = patches.view(
+            batch_size,
+            h_patches,
+            w_patches,
+            patch_size,
+            patch_size,
+            out_channels,
+        )
+        # [B, H_patch, W_patch, patch_size, patch_size, C]
+        # -> [B, C, H_patch, patch_size, W_patch, patch_size]
+        patches = patches.permute(0, 5, 1, 3, 2, 4)
+        # -> [B, C, H_img, W_img]
+        images = patches.reshape(batch_size, out_channels, height, width)
+        return images
+    def forward(
+        self,
+        image: torch.Tensor,  # [B, C, H, W]
+        timestep: torch.Tensor,  # [B]
+        context: torch.Tensor,  # [B, context_len, context_dim]
+        context_mask: torch.Tensor | None = None,  # [B, context_len]
+    ):
+        batch_size, _in_channels, height, width = image.shape
+        time_embed: torch.Tensor = self.time_embedder(timestep)  # [B, hidden_dim]
+        time_tokens = time_embed.unsqueeze(1).repeat(  # add seq_len dim
+            1,
+            self.time_position_embeds.shape[0],  # num_time_tokens
+            1,
+        ) + self.time_position_embeds.unsqueeze(0).repeat(  # add batch dim
+            batch_size,
+            1,
+            1,
+        )  # [B, num_time_tokens, hidden_dim]
+        num_time_tokens = time_tokens.shape[1]
+        context_embed = self.context_embedder(context)
+        context_len = context_embed.shape[1]
+        patches = self.patch_embedder(image)  # [B, N, hidden_dim]]
+        patches_len = patches.shape[1]
+        # context -> time -> patches
+        context_position_ids = self.prepare_context_position_ids(
+            seq_len=context_len,
+            context_start_index=0,
+        )
+        time_position_ids = self.prepare_time_position_ids(
+            seq_len=num_time_tokens,
+            time_start_index=context_len,
+        )
+        patches_position_ids = self.prepare_image_position_ids(
+            height=height,
+            width=width,
+            image_index=context_len + num_time_tokens,  # after context and time tokens
+        )
+        # actually: patches -> time -> context
+        position_ids = torch.cat(
+            [
+                patches_position_ids,
+                time_position_ids,
+                context_position_ids,
+            ],
+            dim=0,
+        ).view(1, -1, self.num_axes)  # (1, total_seq_len, n_axes)
+        # prepare RoPE
+        freqs_cis = (
+            self.rope_embedder(position_ids=position_ids)
+            .repeat(
+                batch_size,
+                1,
+                1,
+            )
+            .to(device=image.device)
+        )
+        # attention mask
+        if context_mask is not None:
+            patches_mask = torch.ones(batch_size, patches_len, device=image.device)
+            time_mask = torch.ones(batch_size, num_time_tokens, device=image.device)
+            mask = torch.cat(
+                [
+                    patches_mask,
+                    time_mask,
+                    context_mask.to(image.device),
+                ],
+                dim=1,
+            )
+        else:
+            # attend all
+            mask = torch.ones(
+                batch_size,
+                patches_len + num_time_tokens + context_len,
+                device=image.device,
+            )
+        for _i, block in enumerate(self.blocks):
+            tokens = torch.cat(
+                [
+                    patches,  # 16x16
+                    time_tokens,  # 4
+                    context_embed,  # 64
+                ],
+                dim=1,  # cat in seq_len dimension
+            )
+            if self.gradient_checkpointing and self.training:
+                patches = checkpoint.checkpoint(  # type: ignore
+                    block,
+                    tokens,
+                    freqs_cis,
+                    mask,
+                )[:, :patches_len, :]
+            else:
+                patches = block(
+                    tokens,
+                    rope_freqs=freqs_cis,
+                    mask=mask,
+                )[:, :patches_len, :]  # only keep patch tokens
+        patches = self.final_layer(patches)
+        pred_image = self.unpatchify(
+            patches,
+            height=height,
+            width=width,
+        )
+        return pred_image

model/pipeline.py ADDED Viewed

	@@ -0,0 +1,412 @@

+from tqdm import tqdm
+from PIL import Image
+import torch
+import torch.nn as nn
+import numpy as np
+from accelerate import init_empty_weights
+from safetensors.torch import load_file
+from .denoiser import JiT
+from .class_encoder import ClassEncoder
+from .config import JiTConfig, ClassContextConfig
+# from .text_encoder import TextEncoder
+# from ...modules.quant import replace_by_prequantized_weights
+# from ...utils import tensor as tensor_utils
+def tensor_to_images(
+    tensor: torch.Tensor,
+) -> list[Image.Image]:
+    # -1~1 -> 0~255
+    # denormalize
+    tensor = tensor.clamp(-1.0, 1.0)
+    tensor = (tensor + 1.0) / 2.0 * 255.0
+    # permute
+    tensor = tensor.permute(0, 2, 3, 1)  # [B, C, H, W] -> [B, H, W, C]
+    # convert to numpy array
+    image_array = tensor.cpu().float().numpy().astype(np.uint8)
+    return [Image.fromarray(image) for image in image_array]
+class JiTModel(nn.Module):
+    denoiser: JiT
+    denoiser_class: type[JiT] = JiT
+    class_encoder: ClassEncoder
+    def __init__(
+        self,
+        config: JiTConfig,
+    ):
+        super().__init__()
+        self.config = config
+        self.denoiser = self.denoiser_class(config.denoiser)
+        if isinstance(config.context_encoder, ClassContextConfig):
+            self.class_encoder = ClassEncoder(
+                label2id=config.context_encoder.label2id,
+                embedding_dim=config.denoiser.context_dim,
+            )
+        else:
+            raise NotImplementedError(
+                "Only ClassContextConfig is supported in this version."
+            )
+        self.progress_bar = tqdm
+    def _load_checkpoint(
+        self,
+        checkpoint_path: str,
+        strict: bool = True,
+    ):
+        state_dict = load_file(checkpoint_path)
+        # replace_by_prequantized_weights(self, state_dict)
+        self.denoiser.load_state_dict(
+            {
+                key[len("denoiser.") :]: value
+                for key, value in state_dict.items()
+                if key.startswith("denoiser.")
+            },
+            strict=strict,
+            assign=True,
+        )
+        if self.class_encoder is not None:
+            self.class_encoder.load_state_dict(
+                {
+                    key[len("class_encoder.") :]: value
+                    for key, value in state_dict.items()
+                    if key.startswith("class_encoder.")
+                },
+                strict=strict,
+                assign=True,
+            )
+        # if self.text_encoder is not None:
+        #     self.text_encoder.model.load_state_dict(
+        #         {
+        #             key[len("text_encoder.") :]: value
+        #             for key, value in state_dict.items()
+        #             if key.startswith("text_encoder.")
+        #         },
+        #         strict=strict,
+        #         assign=True,
+        #     )
+    @classmethod
+    def from_pretrained(
+        cls,
+        config: JiTConfig,
+        checkpoint_path: str,
+    ) -> "JiTModel":
+        with init_empty_weights():
+            model = cls(config)
+        model._load_checkpoint(checkpoint_path)
+        return model
+    @classmethod
+    def new_with_config(
+        cls,
+        config: JiTConfig,
+    ) -> "JiTModel":
+        with init_empty_weights():
+            model = cls(config)
+        model.denoiser.to_empty(device="cpu")
+        model.denoiser.initialize_weights()
+        if isinstance(config.context_encoder, ClassContextConfig):
+            model.class_encoder.to_empty(device="cpu")
+            model.class_encoder.initialize_weights()
+        else:
+            # model.text_encoder = TextEncoder.from_remote(
+            #     repo_id=config.context_encoder.pretrained_model,
+            # )
+            raise NotImplementedError(
+                "Only ClassContextConfig is supported in this version."
+            )
+        return model
+    def prepare_noisy_image(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        seed: int | None = None,
+    ):
+        if seed is not None:
+            generator = torch.Generator(device=device)
+            generator.manual_seed(seed)
+            noise = torch.randn(
+                (batch_size, 3, height, width),
+                dtype=dtype,
+                device=device,
+                generator=generator,
+            )
+        else:
+            noise = torch.randn(
+                (batch_size, 3, height, width),
+                dtype=dtype,
+                device=device,
+            )
+        return noise
+    def prepare_timesteps(
+        self,
+        num_inference_steps: int,
+        device: torch.device,
+    ):
+        timesteps = torch.linspace(
+            0.0,
+            1.0,
+            num_inference_steps + 1,
+            device=device,
+        )
+        return timesteps
+    def prepare_context_embeddings(
+        self,
+        prompts: str | list[str],
+        negative_prompt: str | list[str],
+        max_token_length: int = 64,
+        do_cfg: bool = False,
+    ):
+        # if self.text_encoder is not None:
+        #     encoder_output = self.text_encoder.encode_prompts(
+        #         prompts,
+        #         negative_prompts=negative_prompt,
+        #         use_negative_prompts=do_cfg,
+        #         max_token_length=max_token_length,
+        #     )
+        #     if do_cfg:
+        #         prompt_embeddings = torch.cat(
+        #             [
+        #                 encoder_output.positive_embeddings,
+        #                 encoder_output.negative_embeddings,
+        #             ]
+        #         )
+        #         attention_mask = torch.cat(
+        #             [
+        #                 encoder_output.positive_attention_mask,
+        #                 encoder_output.negative_attention_mask,
+        #             ]
+        #         )
+        #     else:
+        #         prompt_embeddings = encoder_output.positive_embeddings
+        #         attention_mask = encoder_output.positive_attention_mask
+        if self.class_encoder is not None:
+            embeddings, attention_mask = self.class_encoder.encode_prompts(
+                prompts,
+                max_token_length=max_token_length,
+            )
+            negative_embeddings, _ = self.class_encoder.encode_prompts(
+                negative_prompt,
+                max_token_length=max_token_length,
+            )
+            if do_cfg:
+                prompt_embeddings = torch.cat(
+                    [
+                        embeddings,
+                        negative_embeddings,
+                    ],
+                    dim=0,
+                )
+                attention_mask = torch.cat(
+                    [
+                        attention_mask,
+                        attention_mask,
+                    ],
+                    dim=0,
+                )
+            else:
+                prompt_embeddings = embeddings
+        else:
+            raise NotImplementedError("Only ClassEncoder is supported in this version.")
+        return prompt_embeddings, attention_mask
+    def to_pil_images(self, tensor: torch.Tensor) -> list[Image.Image]:
+        return tensor_to_images(tensor)
+    def image_to_velocity(
+        self,
+        image: torch.Tensor,
+        noisy: torch.Tensor,
+        timestep: torch.Tensor,
+        clamp_eps: float = 1e-5,
+    ):
+        return (image - noisy) / (1 - timestep.view(-1, 1, 1, 1)).clamp_min_(clamp_eps)
+    def renorm_cfg(
+        self,
+        positive_velocity: torch.Tensor,
+        cfg_velocity: torch.Tensor,
+    ) -> torch.Tensor:
+        positive_norm = torch.norm(positive_velocity, dim=-1, keepdim=True)
+        cfg_norm = torch.norm(cfg_velocity, dim=-1, keepdim=True)
+        new_cfg_velocity = cfg_velocity * (positive_norm / cfg_norm)
+        return new_cfg_velocity
+    def dynamic_thresholding(
+        self,
+        images: torch.Tensor,
+        percentile: float = 0.995,
+    ) -> torch.Tensor:
+        """
+        Apply dynamic thresholding to the images.
+        Args:
+            images (torch.Tensor): The input images tensor.
+            percentile (float): The percentile value for thresholding.
+        Returns:
+            torch.Tensor: The thresholded images tensor.
+        """
+        batch_size = images.shape[0]
+        flattened_images = images.view(batch_size, -1)
+        abs_images = torch.abs(flattened_images)
+        s = torch.quantile(abs_images, percentile, dim=1, keepdim=True)
+        s = torch.clamp(s, min=1.0).view(batch_size, 1, 1, 1)
+        thresholded_images = torch.clamp(images, -s, s) / s
+        return thresholded_images
+    def normalize_prompts(
+        self,
+        prompt: str | list[str],
+    ) -> list[str]:
+        return prompt if isinstance(prompt, list) else [prompt]
+    @torch.inference_mode()
+    def generate(
+        self,
+        prompt: str | list[str],
+        negative_prompt: str | list[str] | None = None,
+        width: int = 256,
+        height: int = 256,
+        num_inference_steps: int = 20,
+        cfg_scale: float = 2.0,
+        max_token_length: int = 64,
+        seed: int | None = None,
+        execution_dtype: torch.dtype = torch.bfloat16,
+        device: torch.device | str = torch.device("cuda"),
+        do_cfg_renorm: bool = False,
+        do_dynamic_thresholding: bool = False,
+        cfg_time_range: list[float] = [0.0, 1.0],
+        # do_offloading: bool = False,
+    ):
+        # 1. Prepare args
+        execution_device: torch.device = (
+            torch.device(device) if isinstance(device, str) else device
+        )
+        do_cfg = cfg_scale > 1.0
+        timesteps = self.prepare_timesteps(
+            num_inference_steps=num_inference_steps,
+            device=execution_device,
+        )
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        # 3. prepare noise
+        noisy_image = self.prepare_noisy_image(
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            dtype=execution_dtype,
+            device=execution_device,
+            seed=seed,
+        )
+        negative_prompts = [""] if negative_prompt is None else negative_prompt
+        negative_prompts = self.normalize_prompts(negative_prompts)
+        if len(negative_prompts) != batch_size and len(negative_prompts) == 1:
+            negative_prompts = negative_prompts * batch_size
+        prompt_embeddings, attention_mask = self.prepare_context_embeddings(
+            prompts=prompt,
+            negative_prompt=negative_prompts,
+            max_token_length=max_token_length,
+            do_cfg=do_cfg,
+        )
+        # 4. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as pbar:
+            for i, timestep in enumerate(timesteps[:-1]):
+                image_input = torch.cat([noisy_image] * 2) if do_cfg else noisy_image
+                batch_timestep = timestep.expand(image_input.shape[0])
+                model_pred = self.denoiser(
+                    image=image_input,
+                    timestep=batch_timestep,
+                    context=prompt_embeddings,
+                    context_mask=attention_mask,
+                )
+                if do_cfg and cfg_time_range[0] <= float(timestep) <= cfg_time_range[1]:
+                    image_pred_positive, image_pred_negative = model_pred.chunk(2)
+                    v_pred_positive = self.image_to_velocity(
+                        image=image_pred_positive,
+                        noisy=noisy_image,
+                        timestep=timestep.expand(batch_size),
+                    )
+                    v_pred_negative = self.image_to_velocity(
+                        image=image_pred_negative,
+                        noisy=noisy_image,
+                        timestep=timestep.expand(batch_size),
+                    )
+                    velocity = v_pred_positive + cfg_scale * (
+                        v_pred_positive - v_pred_negative
+                    )
+                    if do_cfg_renorm:
+                        velocity = self.renorm_cfg(
+                            positive_velocity=v_pred_positive,
+                            cfg_velocity=velocity,
+                        )
+                    if do_dynamic_thresholding:
+                        # re-calculate the image prediction after cfg
+                        image_pred = noisy_image + velocity * (1 - timestep)
+                        image_pred = self.dynamic_thresholding(image_pred)
+                        velocity = self.image_to_velocity(
+                            image=image_pred,
+                            noisy=noisy_image,
+                            timestep=timestep.expand(batch_size),
+                        )
+                else:
+                    velocity = self.image_to_velocity(
+                        image=model_pred[:batch_size],
+                        noisy=noisy_image,
+                        timestep=timestep.expand(batch_size),
+                    )
+                # new noisy image
+                noisy_image = noisy_image + velocity * (timesteps[i + 1] - timestep)
+                pbar.update()
+        # now it should be clean
+        clean_image = noisy_image
+        # to PIL images
+        pil_images = self.to_pil_images(clean_image.cpu())
+        return pil_images

requirements.txt CHANGED Viewed

@@ -1,6 +1,4 @@
-accelerate
-diffusers
-invisible_watermark
 torch
 transformers
-xformers

+spaces
 torch
 transformers
+huggingface-hub