Spaces · Running on Zero

realantonvoronov committed
Commit 55ca09f · 1 parent: 385f11a

init commit
Browse files

- README.md +7 -8
- app.py +90 -73
- models/__init__.py +95 -0
- models/__pycache__/__init__.cpython-311.pyc +0 -0
- models/__pycache__/basic_switti.cpython-311.pyc +0 -0
- models/__pycache__/basic_vae.cpython-311.pyc +0 -0
- models/__pycache__/clip.cpython-311.pyc +0 -0
- models/__pycache__/helpers.cpython-311.pyc +0 -0
- models/__pycache__/pipeline.cpython-311.pyc +0 -0
- models/__pycache__/quant.cpython-311.pyc +0 -0
- models/__pycache__/rope.cpython-311.pyc +0 -0
- models/__pycache__/switti.cpython-311.pyc +0 -0
- models/__pycache__/vqvae.cpython-311.pyc +0 -0
- models/basic_switti.py +461 -0
- models/basic_vae.py +289 -0
- models/clip.py +50 -0
- models/helpers.py +93 -0
- models/pipeline.py +227 -0
- models/quant.py +398 -0
- models/rope.py +48 -0
- models/switti.py +409 -0
- models/vqvae.py +184 -0
- requirements.txt +16 -6
README.md
CHANGED
@@ -1,13 +1,12 @@
 ---
 title: Switti
-emoji: πΌ
-colorFrom: purple
-colorTo: red
 sdk: gradio
-
-
-
+emoji: π
+colorFrom: red
+colorTo: red
+pinned: true
 short_description: Generate images with Switti
+preload_from_hub:
+- yresearch/Switti
+- yresearch/VQVAE-Switti
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -2,59 +2,67 @@ import gradio as gr
import numpy as np
import random

-
-from 
+import spaces
+from models import SwittiPipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "
+model_repo_id = "yresearch/Switti"

-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32

-pipe = 
-pipe = pipe.to(device)
+pipe = SwittiPipeline.from_pretrained(model_repo_id, device=device)

MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024


-
+@spaces.GPU(duration=65)
def infer(
    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-
-
-
-
+    negative_prompt="",
+    seed=42,
+    randomize_seed=False,
+    guidance_scale=4.0,
+    top_k=400,
+    top_p=0.95,
+    more_smooth=True,
+    smooth_start_si=2,
+    turn_off_cfg_start_si=10,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

-    generator = torch.Generator().manual_seed(seed)
-
    image = pipe(
        prompt=prompt,
-
-
-
-
-
-
-
+        null_prompt=negative_prompt,
+        cfg=guidance_scale,
+        top_p=top_p,
+        top_k=top_k,
+        more_smooth=more_smooth,
+        smooth_start_si=smooth_start_si,
+        turn_off_cfg_start_si=turn_off_cfg_start_si,
+        seed=seed,
+    )[0]

    return image, seed


examples = [
-
-
-
+    "Cute winter dragon baby, kawaii, Pixar, ultra detailed, glacial background, extremely realistic.",
+    "Cat as a wizard",
+    ("An ancient ruined archway on the moon, fantasy, ruins of an alien civilization, "
+     "concept art, blue sky, reflection in water pool, large white planet rising behind it"),
+    ("A lizard that looks very much like a man, with developed muscles, leather armor "
+     "with metal elements, in the hands of a large trident decorated with ancient runes,"
+     " against the background of a small lake, everything is well drawn in the style of fantasy"),
+    ("The Mandalorian by masamune shirow, fighting stance, in the snow, "
+     "cinematic lighting, intricate detail, character design"),
+    "Phoenix woman brown skin asian eyes silver scales, full body, high detail",
+    ("Portrait of an alien family from the 1970's, futuristic clothes, "
+     "absurd alien helmet, straight line, surreal, strange, absurd, photorealistic, "
+     "Hasselblad, Kodak, portra 800, 35mm lens, F 2.8, photo studio."),
+    ("32 - bit pixelated future Hiphop producer in glowing power street ware, "
+     "noriyoshi ohrai, in the style of minecraft tomer hanuka."),
]

css = """

@@ -66,8 +74,8 @@

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # 
-
+        gr.Markdown(" # [Switti](https://yandex-research.github.io/switti)")
+        gr.Markdown("[Learn more](https://yandex-research.github.io/switti) about Switti.")
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",

@@ -81,59 +89,66 @@ with gr.Blocks(css=css) as demo:

        result = gr.Image(label="Result", show_label=False)

+        seed = gr.Number(
+            label="Seed",
+            minimum=0,
+            maximum=MAX_SEED,
+            value=0,
+        )
+
+        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+
+        guidance_scale = gr.Slider(
+            label="Guidance scale",
+            minimum=0.0,
+            maximum=10.,
+            step=0.5,
+            value=4.,
+        )
+
        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
-                visible=
-            )
-
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
+                visible=True,
            )

-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
            with gr.Row():
-
-                    label="
-                    minimum=
-                    maximum=
-                    step=
-                    value=
+                top_k = gr.Slider(
+                    label="Sampling top k",
+                    minimum=10,
+                    maximum=1000,
+                    step=10,
+                    value=400,
                )
-
-
-
-
-
-
-                    value=1024,  # Replace with defaults that work for your model
+                top_p = gr.Slider(
+                    label="Sampling top p",
+                    minimum=0.0,
+                    maximum=1.,
+                    step=0.01,
+                    value=0.95,
                )
-
+
            with gr.Row():
-
-
-
-
-
-
+                more_smooth = gr.Checkbox(label="Smoothing with Gumbel softmax sampling", value=True)
+                smooth_start_si = gr.Slider(
+                    label="Smoothing starting scale",
+                    minimum=0,
+                    maximum=10,
+                    step=1,
+                    value=2,
                )
-
-
-
-
-                    maximum=50,
+                turn_off_cfg_start_si = gr.Slider(
+                    label="Disable CFG from scale",
+                    minimum=0,
+                    maximum=10,
                    step=1,
-                    value=
+                    value=8,
                )

-
+
+        gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=True)# cache_mode="lazy")
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,

@@ -142,10 +157,12 @@
            negative_prompt,
            seed,
            randomize_seed,
-            width,
-            height,
            guidance_scale,
-
+            top_k,
+            top_p,
+            more_smooth,
+            smooth_start_si,
+            turn_off_cfg_start_si,
        ],
        outputs=[result, seed],
    )
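The new app.py forwards every UI control to SwittiPipeline.__call__. For reference, a minimal sketch of driving the same pipeline outside Gradio; it assumes this Space's models package is on the import path and a GPU is available, and the argument values simply mirror the UI defaults above:

import torch
from models import SwittiPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = SwittiPipeline.from_pretrained("yresearch/Switti", device=device)

images = pipe(
    prompt="Cat as a wizard",
    null_prompt="",                # negative prompt used for classifier-free guidance
    cfg=4.0,                       # guidance scale
    top_k=400,
    top_p=0.95,
    more_smooth=True,              # Gumbel-softmax smoothing of sampled tokens
    smooth_start_si=2,             # first scale index at which smoothing is applied
    turn_off_cfg_start_si=8,       # scale index from which CFG is disabled
    seed=42,
)
images[0].save("switti_sample.png")  # return_pil=True by default, so these are PIL images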
models/__init__.py
ADDED
@@ -0,0 +1,95 @@
import torch.nn as nn

from .clip import FrozenCLIPEmbedder
from .switti import Switti
from .vqvae import VQVAE
from .pipeline import SwittiPipeline


def build_models(
    # Shared args
    device,
    patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),  # 10 steps by default
    # VQVAE args
    V=4096,
    Cvae=32,
    ch=160,
    share_quant_resi=4,
    # Switti args
    depth=16,
    rope=True,
    rope_theta=10000,
    rope_size=128,
    use_swiglu_ffn=True,
    use_ar=False,
    use_crop_cond=True,
    attn_l2_norm=True,
    init_adaln=0.5,
    init_adaln_gamma=1e-5,
    init_head=0.02,
    init_std=-1,  # init_std < 0: automated
    drop_rate=0.0,
    attn_drop_rate=0.0,
    dpr=0,
    norm_eps=1e-6,
    # pipeline args
    text_encoder_path="openai/clip-vit-large-patch14",
    text_encoder_2_path="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
) -> tuple[VQVAE, Switti, SwittiPipeline]:
    heads = depth
    width = depth * 64
    if dpr > 0:
        dpr = dpr * depth / 24

    # disable built-in initialization for speed
    for clz in (
        nn.Linear,
        nn.LayerNorm,
        nn.BatchNorm2d,
        nn.SyncBatchNorm,
        nn.Conv1d,
        nn.Conv2d,
        nn.ConvTranspose1d,
        nn.ConvTranspose2d,
    ):
        setattr(clz, "reset_parameters", lambda self: None)

    # build models
    vae_local = VQVAE(
        vocab_size=V,
        z_channels=Cvae,
        ch=ch,
        test_mode=True,
        share_quant_resi=share_quant_resi,
        v_patch_nums=patch_nums,
    ).to(device)

    switti_wo_ddp = Switti(
        depth=depth,
        embed_dim=width,
        num_heads=heads,
        drop_rate=drop_rate,
        attn_drop_rate=attn_drop_rate,
        drop_path_rate=dpr,
        norm_eps=norm_eps,
        attn_l2_norm=attn_l2_norm,
        patch_nums=patch_nums,
        rope=rope,
        rope_theta=rope_theta,
        rope_size=rope_size,
        use_swiglu_ffn=use_swiglu_ffn,
        use_ar=use_ar,
        use_crop_cond=use_crop_cond,
    ).to(device)

    switti_wo_ddp.init_weights(
        init_adaln=init_adaln,
        init_adaln_gamma=init_adaln_gamma,
        init_head=init_head,
        init_std=init_std,
    )
    text_encoder = FrozenCLIPEmbedder(text_encoder_path)
    text_encoder_2 = FrozenCLIPEmbedder(text_encoder_2_path)
    pipe = SwittiPipeline(switti_wo_ddp, vae_local, text_encoder, text_encoder_2, device)

    return vae_local, switti_wo_ddp, pipe
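build_models constructs randomly initialized sub-models (the pretrained path is SwittiPipeline.from_pretrained in models/pipeline.py). A hedged usage sketch, assuming a CUDA machine since the text encoders default to the "cuda" device and both CLIP checkpoints are downloaded on first use:

from models import build_models

# Weights of the VQVAE and the Switti transformer are random here.
vae, switti, pipe = build_models(device="cuda", depth=16)
n_params = sum(p.numel() for p in switti.parameters()) / 1e6
print(f"Switti depth 16: {n_params:.0f}M parameters, embed_dim {16 * 64}")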
models/__pycache__/__init__.cpython-311.pyc        ADDED    Binary file (2.9 kB)
models/__pycache__/basic_switti.cpython-311.pyc    ADDED    Binary file (23.3 kB)
models/__pycache__/basic_vae.cpython-311.pyc       ADDED    Binary file (15.8 kB)
models/__pycache__/clip.cpython-311.pyc            ADDED    Binary file (3.01 kB)
models/__pycache__/helpers.cpython-311.pyc         ADDED    Binary file (5.29 kB)
models/__pycache__/pipeline.cpython-311.pyc        ADDED    Binary file (12.6 kB)
models/__pycache__/quant.cpython-311.pyc           ADDED    Binary file (24.6 kB)
models/__pycache__/rope.cpython-311.pyc            ADDED    Binary file (4.45 kB)
models/__pycache__/switti.cpython-311.pyc          ADDED    Binary file (23.3 kB)
models/__pycache__/vqvae.cpython-311.pyc           ADDED    Binary file (10.2 kB)
models/basic_switti.py
ADDED
@@ -0,0 +1,461 @@
import math
import warnings

import torch
import torch.nn.functional as F
from einops import rearrange
from torch import nn
from torch.nn.functional import scaled_dot_product_attention  # q, k, v: BHLc

from models.helpers import DropPath
from models.rope import apply_rotary_emb

try:
    from flash_attn.ops.fused_dense import fused_mlp_func
except ImportError:
    fused_mlp_func = None

# this file only provides the blocks used in Switti transformer
__all__ = ["FFN", "SwiGLUFFN", "RMSNorm", "AdaLNSelfCrossAttn", "AdaLNBeforeHead"]


try:
    from apex.normalization import FusedRMSNorm as RMSNorm
except ImportError:
    warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation")

    class RMSNorm(torch.nn.Module):
        def __init__(self, dim: int, eps: float = 1e-6):
            """
            Initialize the RMSNorm normalization layer.

            Args:
                dim (int): The dimension of the input tensor.
                eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

            Attributes:
                eps (float): A small value added to the denominator for numerical stability.
                weight (nn.Parameter): Learnable scaling parameter.
            """
            super().__init__()
            self.eps = eps
            self.weight = nn.Parameter(torch.ones(dim))

        def _norm(self, x):
            """
            Apply the RMSNorm normalization to the input tensor.

            Args:
                x (torch.Tensor): The input tensor.

            Returns:
                torch.Tensor: The normalized tensor.
            """
            return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

        def forward(self, x):
            """
            Forward pass through the RMSNorm layer.

            Args:
                x (torch.Tensor): The input tensor.

            Returns:
                torch.Tensor: The output tensor after applying RMSNorm.
            """
            output = self._norm(x.float()).type_as(x)
            return output * self.weight


class FFN(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        drop=0.0,
        fused_if_available=True,
    ):
        super().__init__()
        self.fused_mlp_func = fused_mlp_func if fused_if_available else None
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU(approximate="tanh")
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop, inplace=True) if drop > 0 else nn.Identity()

    def forward(self, x):
        if self.fused_mlp_func is not None:
            return self.drop(
                self.fused_mlp_func(
                    x=x,
                    weight1=self.fc1.weight,
                    weight2=self.fc2.weight,
                    bias1=self.fc1.bias,
                    bias2=self.fc2.bias,
                    activation="gelu_approx",
                    save_pre_act=self.training,
                    return_residual=False,
                    checkpoint_lvl=0,
                    heuristic=0,
                    process_group=None,
                )
            )
        else:
            return self.drop(self.fc2(self.act(self.fc1(x))))

    def extra_repr(self) -> str:
        return f"fused_mlp_func={self.fused_mlp_func is not None}"


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        dim: int,
        ff_mult: float = 8 / 3,
    ):
        """
        Initialize the FeedForward module.

        Args:
            dim (int): Input dimension.
            ff_mult (float, optional): Custom multiplier for hidden dimension. Defaults to 8 / 3.
        """
        super().__init__()
        hidden_dim = int(dim * ff_mult)

        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.fused_mlp_func = None
        self._init()

    def _init(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    # @torch.compile
    def _forward_silu_gating(self, x_gate: torch.Tensor, x_up: torch.Tensor):
        return F.silu(x_gate) * x_up

    def forward(self, x: torch.Tensor):
        return self.down_proj(
            self._forward_silu_gating(self.gate_proj(x), self.up_proj(x))
        )

    def extra_repr(self) -> str:
        return f"fused_mlp_func={self.fused_mlp_func is not None}"


class CrossAttention(nn.Module):
    def __init__(
        self,
        embed_dim: int = 768,
        context_dim: int = 2048,
        num_heads: int = 12,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        qk_norm: bool = False,
    ):
        super().__init__()
        assert embed_dim % num_heads == 0
        assert attn_drop == 0.0

        self.num_heads, self.head_dim = (
            num_heads,
            embed_dim // num_heads,
        )
        self.qk_norm = qk_norm
        self.scale = 1 / math.sqrt(self.head_dim)

        self.q_norm = nn.LayerNorm(embed_dim, eps=1e-6, elementwise_affine=False)
        self.k_norm = nn.LayerNorm(embed_dim, eps=1e-6, elementwise_affine=False)

        self.to_q = nn.Linear(embed_dim, embed_dim, bias=True)
        self.to_kv = nn.Linear(context_dim, embed_dim * 2, bias=True)

        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = (
            nn.Dropout(proj_drop, inplace=True) if proj_drop > 0 else nn.Identity()
        )
        self.attn_drop = attn_drop

        # only used during inference
        self.caching, self.cached_k, self.cached_v = False, None, None

    def kv_caching(self, enable: bool):
        self.caching, self.cached_k, self.cached_v = enable, None, None

    def forward(self, x, context, context_attn_bias=None, freqs_cis=None):
        B, L, C = x.shape
        context_B, context_L, context_C = context.shape
        assert B == context_B

        q = self.to_q(x).view(B, L, -1)  # BLD , self.num_heads, self.head_dim)
        if self.qk_norm:
            q = self.q_norm(q)

        q = q.view(B, L, self.num_heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)  # BHLc

        if self.cached_k is None:
            # not using caches or first scale inference
            kv = self.to_kv(context).view(B, context_L, 2, -1)  # qkv: BL3D
            k, v = kv.permute(2, 0, 1, 3).unbind(dim=0)  # q or k or v: BLHD

            if self.qk_norm:
                k = self.k_norm(k)

            k = k.view(B, context_L, self.num_heads, self.head_dim)
            k = k.permute(0, 2, 1, 3)  # BHLc

            v = v.view(B, context_L, self.num_heads, self.head_dim)
            v = v.permute(0, 2, 1, 3)  # BHLc

            if self.caching:
                self.cached_k = k
                self.cached_v = v
        else:
            k = self.cached_k
            v = self.cached_v

        if context_attn_bias is not None:
            context_attn_bias = rearrange(context_attn_bias, "b j -> b 1 1 j")

        dropout_p = self.attn_drop if self.training else 0.0
        out = (
            scaled_dot_product_attention(
                query=q,
                key=k,
                value=v,
                scale=self.scale,
                attn_mask=context_attn_bias,
                dropout_p=dropout_p,
            )
            .transpose(1, 2)
            .reshape(B, L, C)
        )

        return self.proj_drop(self.proj(out))


class SelfAttention(nn.Module):
    def __init__(
        self,
        block_idx: int,
        embed_dim: int = 768,
        num_heads: int = 12,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        qk_norm: bool = False,
    ):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.block_idx, self.num_heads, self.head_dim = (
            block_idx,
            num_heads,
            embed_dim // num_heads,
        )
        self.qk_norm = qk_norm
        self.scale = 1 / math.sqrt(self.head_dim)

        self.q_norm = nn.LayerNorm(embed_dim, eps=1e-6, elementwise_affine=False)
        self.k_norm = nn.LayerNorm(embed_dim, eps=1e-6, elementwise_affine=False)

        self.to_qkv = nn.Linear(embed_dim, embed_dim * 3, bias=True)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = (
            nn.Dropout(proj_drop, inplace=True) if proj_drop > 0 else nn.Identity()
        )
        self.attn_drop = attn_drop

        # only used during inference
        self.caching, self.cached_k, self.cached_v = False, None, None

    def kv_caching(self, enable: bool):
        self.caching, self.cached_k, self.cached_v = enable, None, None

    # NOTE: attn_bias is None during inference because kv cache is enabled
    def forward(self, x, attn_bias, freqs_cis: torch.Tensor = None):
        B, L, C = x.shape

        qkv = self.to_qkv(x).view(B, L, 3, -1)
        q, k, v = qkv.permute(2, 0, 1, 3).unbind(dim=0)  # q or k or v: BLD

        if self.qk_norm:
            q = self.q_norm(q)
            k = self.k_norm(k)

        q = q.view(B, L, self.num_heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)  # BHLc
        k = k.view(B, L, self.num_heads, self.head_dim)
        k = k.permute(0, 2, 1, 3)  # BHLc
        v = v.view(B, L, self.num_heads, self.head_dim)
        v = v.permute(0, 2, 1, 3)  # BHLc
        dim_cat = 2

        if freqs_cis is not None:
            q = apply_rotary_emb(q, freqs_cis=freqs_cis)
            k = apply_rotary_emb(k, freqs_cis=freqs_cis)

        if self.caching:
            if self.cached_k is None:
                self.cached_k = k
                self.cached_v = v
            else:
                k = self.cached_k = torch.cat((self.cached_k, k), dim=dim_cat)
                v = self.cached_v = torch.cat((self.cached_v, v), dim=dim_cat)

        dropout_p = self.attn_drop if self.training else 0.0
        out = (
            scaled_dot_product_attention(
                query=q,
                key=k,
                value=v,
                scale=self.scale,
                attn_mask=attn_bias,
                dropout_p=dropout_p,
            )
            .transpose(1, 2)
            .reshape(B, L, C)
        )

        return self.proj_drop(self.proj(out))

    def extra_repr(self) -> str:
        return f"attn_l2_norm={self.qk_norm}"


class AdaLNSelfCrossAttn(nn.Module):
    def __init__(
        self,
        block_idx,
        last_drop_p,
        embed_dim,
        cond_dim,
        num_heads,
        mlp_ratio=4.0,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        qk_norm=False,
        context_dim=None,
        use_swiglu_ffn=False,
        norm_eps=1e-6,
        use_crop_cond=False,
    ):
        super().__init__()
        assert attn_drop == 0.0
        assert qk_norm

        self.block_idx, self.last_drop_p, self.C = block_idx, last_drop_p, embed_dim
        self.C, self.D = embed_dim, cond_dim
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.attn = SelfAttention(
            block_idx=block_idx,
            embed_dim=embed_dim,
            num_heads=num_heads,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
        )

        if context_dim:
            self.cross_attn = CrossAttention(
                embed_dim=embed_dim,
                context_dim=context_dim,
                num_heads=num_heads,
                attn_drop=attn_drop,
                proj_drop=drop,
                qk_norm=qk_norm,
            )
        else:
            self.cross_attn = None

        if use_swiglu_ffn:
            self.ffn = SwiGLUFFN(dim=embed_dim)
        else:
            self.ffn = FFN(
                in_features=embed_dim,
                hidden_features=round(embed_dim * mlp_ratio),
                drop=drop,
            )

        self.self_attention_norm1 = RMSNorm(embed_dim, eps=norm_eps)
        self.self_attention_norm2 = RMSNorm(embed_dim, eps=norm_eps)
        self.cross_attention_norm1 = RMSNorm(embed_dim, eps=norm_eps)
        self.cross_attention_norm2 = RMSNorm(embed_dim, eps=norm_eps)

        self.ffn_norm1 = RMSNorm(embed_dim, eps=norm_eps)
        self.ffn_norm2 = RMSNorm(embed_dim, eps=norm_eps)

        self.attention_y_norm = RMSNorm(context_dim, eps=norm_eps)

        # AdaLN
        lin = nn.Linear(cond_dim, 6 * embed_dim)
        self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin)

        self.fused_add_norm_fn = None

        self.use_crop_cond = use_crop_cond
        if use_crop_cond:
            self.crop_cond_scales = nn.Parameter(torch.zeros(1, cond_dim))

    # NOTE: attn_bias is None during inference because kv cache is enabled
    def forward(
        self,
        x,
        cond_BD,
        attn_bias,
        crop_cond=None,
        context=None,
        context_attn_bias=None,
        freqs_cis=None,
    ):  # C: embed_dim, D: cond_dim

        if self.use_crop_cond:
            assert crop_cond is not None
            cond_BD = cond_BD + self.crop_cond_scales * crop_cond

        gamma1, gamma2, scale1, scale2, shift1, shift2 = (
            self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)
        )
        x = x + self.self_attention_norm2(
            self.attn(
                self.self_attention_norm1(x).mul(scale1.add(1)).add(shift1),
                attn_bias=attn_bias,
                freqs_cis=freqs_cis,
            )
        ).mul(gamma1)
        if context is not None:
            x = x + self.cross_attention_norm2(
                self.cross_attn(
                    self.cross_attention_norm1(x),
                    self.attention_y_norm(context),
                    context_attn_bias=context_attn_bias,
                    freqs_cis=freqs_cis,
                )
            )
        x = x + self.ffn_norm2(
            self.ffn(self.ffn_norm1(x).mul(scale2.add(1)).add(shift2))
        ).mul(gamma2)
        return x


class AdaLNBeforeHead(nn.Module):
    def __init__(self, C, D, norm_layer):  # C: embed_dim, D: cond_dim
        super().__init__()
        self.C, self.D = C, D
        self.ln_wo_grad = norm_layer(C, elementwise_affine=False)
        self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), nn.Linear(D, 2 * C))

    def forward(self, x_BLC: torch.Tensor, cond_BD: torch.Tensor):
        scale, shift = self.ada_lin(cond_BD).view(-1, 1, 2, self.C).unbind(2)
        return self.ln_wo_grad(x_BLC).mul(scale.add(1)).add_(shift)
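Each AdaLNSelfCrossAttn block derives six modulation vectors (two gains, two scales, two shifts) from the pooled conditioning vector and applies them around self-attention and the FFN, with cross-attention to the text context in between. A standalone shape sketch with illustrative sizes, not the production config; qk_norm must be True per the assert above, and when apex is missing the vanilla RMSNorm fallback is used:

import torch
from models.basic_switti import AdaLNSelfCrossAttn

block = AdaLNSelfCrossAttn(
    block_idx=0,
    last_drop_p=0.0,
    embed_dim=512,
    cond_dim=512,
    num_heads=8,
    qk_norm=True,
    context_dim=2048,   # in the real model: CLIP-L and CLIP-bigG hidden states concatenated
    use_swiglu_ffn=True,
    use_crop_cond=False,
)

x = torch.randn(2, 16, 512)         # (B, L, C) token map for one scale
cond = torch.randn(2, 512)          # pooled text conditioning
context = torch.randn(2, 77, 2048)  # per-token text context for cross-attention
out = block(x, cond_BD=cond, attn_bias=None, context=context)
print(out.shape)                    # torch.Size([2, 16, 512])

With use_crop_cond=True the block would additionally expect a crop_cond tensor of shape (B, cond_dim).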
models/basic_vae.py
ADDED
@@ -0,0 +1,289 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

# this file only provides the 2 modules used in VQVAE
__all__ = ["Encoder", "Decoder"]


"""
References: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/diffusionmodules/model.py
"""


# swish
def nonlinearity(x):
    return x * torch.sigmoid(x)


def Normalize(in_channels, num_groups=32):
    return torch.nn.GroupNorm(
        num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
    )


class Upsample2x(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, x):
        return self.conv(F.interpolate(x, scale_factor=2, mode="nearest"))


class Downsample2x(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=2, padding=0
        )

    def forward(self, x):
        return self.conv(F.pad(x, pad=(0, 1, 0, 1), mode="constant", value=0))


class ResnetBlock(nn.Module):
    def __init__(
        self, *, in_channels, out_channels=None, dropout
    ):  # conv_shortcut=False,  # conv_shortcut: always False in VAE
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout) if dropout > 1e-6 else nn.Identity()
        self.conv2 = torch.nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if self.in_channels != self.out_channels:
            self.nin_shortcut = torch.nn.Conv2d(
                in_channels, out_channels, kernel_size=1, stride=1, padding=0
            )
        else:
            self.nin_shortcut = nn.Identity()

    def forward(self, x):
        h = self.conv1(F.silu(self.norm1(x), inplace=True))
        h = self.conv2(self.dropout(F.silu(self.norm2(h), inplace=True)))
        return self.nin_shortcut(x) + h


class AttnBlock(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.C = in_channels

        self.norm = Normalize(in_channels)
        self.qkv = torch.nn.Conv2d(
            in_channels, 3 * in_channels, kernel_size=1, stride=1, padding=0
        )
        self.w_ratio = int(in_channels) ** (-0.5)
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

    def forward(self, x):
        qkv = self.qkv(self.norm(x))
        B, _, H, W = qkv.shape  # should be B,3C,H,W
        C = self.C
        q, k, v = qkv.reshape(B, 3, C, H, W).unbind(1)

        # compute attention
        q = q.view(B, C, H * W).contiguous()
        q = q.permute(0, 2, 1).contiguous()  # B,HW,C
        k = k.view(B, C, H * W).contiguous()  # B,C,HW
        w = torch.bmm(q, k).mul_(self.w_ratio)  # B,HW,HW
        # w[B,i,j]=sum_c q[B,i,C]k[B,C,j]
        w = F.softmax(w, dim=2)

        # attend to values
        v = v.view(B, C, H * W).contiguous()
        w = w.permute(0, 2, 1).contiguous()  # B,HW,HW (first HW of k, second of q)
        h = torch.bmm(v, w)  # B, C,HW (HW of q) h[B,C,j] = sum_i v[B,C,i] w[B,i,j]
        h = h.view(B, C, H, W).contiguous()

        return x + self.proj_out(h)


def make_attn(in_channels, using_sa=True):
    return AttnBlock(in_channels) if using_sa else nn.Identity()


class Encoder(nn.Module):
    def __init__(
        self,
        *,
        ch=128,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks=2,
        dropout=0.0,
        in_channels=3,
        z_channels,
        double_z=False,
        using_sa=True,
        using_mid_sa=True,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.downsample_ratio = 2 ** (self.num_resolutions - 1)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )

        in_ch_mult = (1,) + tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(
                    ResnetBlock(
                        in_channels=block_in, out_channels=block_out, dropout=dropout
                    )
                )
                block_in = block_out
                if i_level == self.num_resolutions - 1 and using_sa:
                    attn.append(make_attn(block_in, using_sa=True))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample2x(block_in)
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )
        self.mid.attn_1 = make_attn(block_in, using_sa=using_mid_sa)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in,
            (2 * z_channels if double_z else z_channels),
            kernel_size=3,
            stride=1,
            padding=1,
        )

    def forward(self, x):
        # downsampling
        h = self.conv_in(x)
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](h)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
            if i_level != self.num_resolutions - 1:
                h = self.down[i_level].downsample(h)

        # middle
        h = self.mid.block_2(self.mid.attn_1(self.mid.block_1(h)))

        # end
        h = self.conv_out(F.silu(self.norm_out(h), inplace=True))
        return h


class Decoder(nn.Module):
    def __init__(
        self,
        *,
        ch=128,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks=2,
        dropout=0.0,
        in_channels=3,  # in_channels: raw img channels
        z_channels,
        using_sa=True,
        using_mid_sa=True,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_channels

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,) + tuple(ch_mult)
        block_in = ch * ch_mult[self.num_resolutions - 1]

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )
        self.mid.attn_1 = make_attn(block_in, using_sa=using_mid_sa)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in, out_channels=block_in, dropout=dropout
        )

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    ResnetBlock(
                        in_channels=block_in, out_channels=block_out, dropout=dropout
                    )
                )
                block_in = block_out
                if i_level == self.num_resolutions - 1 and using_sa:
                    attn.append(make_attn(block_in, using_sa=True))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample2x(block_in)
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, in_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, z):
        # z to block_in
        # middle
        h = self.mid.block_2(self.mid.attn_1(self.mid.block_1(self.conv_in(z))))

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.conv_out(F.silu(self.norm_out(h), inplace=True))
        return h
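The Encoder downsamples by a factor of 2 ** (len(ch_mult) - 1) and the Decoder mirrors it. A small shape sketch with an illustrative config (not the pretrained VQVAE-Switti sizes):

import torch
from models.basic_vae import Encoder, Decoder

enc = Encoder(ch=64, ch_mult=(1, 2, 4), num_res_blocks=1, z_channels=32, double_z=False)
dec = Decoder(ch=64, ch_mult=(1, 2, 4), num_res_blocks=1, z_channels=32)

x = torch.randn(1, 3, 64, 64)
z = enc(x)        # (1, 32, 16, 16): spatial size divided by 2 ** (3 - 1)
x_rec = dec(z)    # (1, 3, 64, 64)
print(z.shape, x_rec.shape)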
models/clip.py
ADDED
@@ -0,0 +1,50 @@
import torch
import torch.nn as nn
from transformers import CLIPTextModel, CLIPTokenizer


class FrozenCLIPEmbedder(nn.Module):
    """Uses the CLIP transformer encoder for text (from huggingface)"""

    def __init__(
        self,
        version="openai/clip-vit-large-patch14",
        device="cuda",
        max_length=77,
        freeze=True,
    ):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version).to(device)
        self.device = device
        self.hidden_size = self.transformer.config.hidden_size
        self.max_length = max_length
        if freeze:
            self.freeze()

    def freeze(self):
        self.transformer = self.transformer.eval()
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, text):
        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        ).to(self.device)

        outputs = self.transformer(**batch_encoding)

        attn_bias = batch_encoding["attention_mask"].to(outputs["last_hidden_state"].dtype)
        attn_bias[attn_bias == 0] = -float("inf")
        attn_bias[attn_bias == 1] = 0.0
        outputs["attn_bias"] = attn_bias
        return outputs

    @torch.no_grad()
    def encode(self, text):
        return self(text)
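The wrapper returns the regular transformers text-model output with an extra attn_bias field: 0 for real tokens and -inf for padding, ready to be added to attention logits. A usage sketch; the checkpoint is downloaded from the Hub on first use, and device="cpu" keeps it runnable without a GPU:

from models.clip import FrozenCLIPEmbedder

text_encoder = FrozenCLIPEmbedder("openai/clip-vit-large-patch14", device="cpu")
out = text_encoder.encode(["Cat as a wizard", "Phoenix woman"])
print(out.last_hidden_state.shape)  # torch.Size([2, 77, 768])
print(out.attn_bias.shape)          # torch.Size([2, 77]), additive mask of 0 / -inf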
models/helpers.py
ADDED
@@ -0,0 +1,93 @@
import torch
from torch import nn as nn
from torch.nn import functional as F


def sample_with_top_k_top_p_(
    logits_BlV: torch.Tensor,
    top_k: int = 0,
    top_p: float = 0.0,
    rng=None,
    num_samples=1,
) -> torch.Tensor:  # return idx, shaped (B, l)
    B, l, V = logits_BlV.shape
    if top_k > 0:
        idx_to_remove = logits_BlV < logits_BlV.topk(
            top_k, largest=True, sorted=False, dim=-1
        )[0].amin(dim=-1, keepdim=True)
        logits_BlV.masked_fill_(idx_to_remove, -torch.inf)
    if top_p > 0:
        sorted_logits, sorted_idx = logits_BlV.sort(dim=-1, descending=False)
        sorted_idx_to_remove = sorted_logits.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p)
        sorted_idx_to_remove[..., -1:] = False
        logits_BlV.masked_fill_(
            sorted_idx_to_remove.scatter(
                sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove
            ),
            -torch.inf,
        )
    # sample (have to squeeze cuz torch.multinomial can only be used for 2D tensor)
    replacement = num_samples >= 0
    num_samples = abs(num_samples)
    return torch.multinomial(
        logits_BlV.softmax(dim=-1).view(-1, V),
        num_samples=num_samples,
        replacement=replacement,
        generator=rng,
    ).view(B, l, num_samples)


def gumbel_softmax_with_rng(
    logits: torch.Tensor,
    tau: float = 1,
    hard: bool = False,
    eps: float = 1e-10,
    dim: int = -1,
    rng: torch.Generator | None = None,
) -> torch.Tensor:
    if rng is None:
        return F.gumbel_softmax(logits=logits, tau=tau, hard=hard, eps=eps, dim=dim)

    gumbels = (
        -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format)
        .exponential_(generator=rng)
        .log()
    )
    gumbels = (logits + gumbels) / tau
    y_soft = gumbels.softmax(dim)

    if hard:
        index = y_soft.max(dim, keepdim=True)[1]
        y_hard = torch.zeros_like(
            logits, memory_format=torch.legacy_contiguous_format
        ).scatter_(dim, index, 1.0)
        ret = y_hard - y_soft.detach() + y_soft
    else:
        ret = y_soft
    return ret


def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):  # taken from timm
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor


class DropPath(nn.Module):  # taken from timm
    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f"(drop_prob=...)"
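Note the trailing underscore in sample_with_top_k_top_p_: the function masks the logits tensor in place before sampling, so pass a copy if the original logits are still needed. A standalone sketch on random logits:

import torch
from models.helpers import sample_with_top_k_top_p_

logits = torch.randn(2, 5, 100)  # (B, l, V)
idx = sample_with_top_k_top_p_(logits.clone(), top_k=40, top_p=0.9, num_samples=1)
print(idx.shape)                 # torch.Size([2, 5, 1]) sampled token indices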
models/pipeline.py
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torchvision.transforms import ToPILImage
|
3 |
+
from PIL.Image import Image as PILImage
|
4 |
+
|
5 |
+
from models.vqvae import VQVAEHF
|
6 |
+
from models.clip import FrozenCLIPEmbedder
|
7 |
+
from models.switti import SwittiHF, get_crop_condition
|
8 |
+
from models.helpers import sample_with_top_k_top_p_, gumbel_softmax_with_rng
|
9 |
+
|
10 |
+
|
11 |
+
class SwittiPipeline:
|
12 |
+
vae_path = "yresearch/VQVAE-Switti"
|
13 |
+
text_encoder_path = "openai/clip-vit-large-patch14"
|
14 |
+
text_encoder_2_path = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
|
15 |
+
|
16 |
+
def __init__(self, switti, vae, text_encoder, text_encoder_2, device):
|
17 |
+
self.switti = switti
|
18 |
+
self.vae = vae
|
19 |
+
self.text_encoder = text_encoder
|
20 |
+
self.text_encoder_2 = text_encoder_2
|
21 |
+
|
22 |
+
self.switti.eval()
|
23 |
+
self.vae.eval()
|
24 |
+
|
25 |
+
self.device = device
|
26 |
+
|
27 |
+
@classmethod
|
28 |
+
def from_pretrained(cls, pretrained_model_name_or_path, device="cuda"):
|
29 |
+
switti = SwittiHF.from_pretrained(pretrained_model_name_or_path).to(device)
|
30 |
+
vae = VQVAEHF.from_pretrained(cls.vae_path).to(device)
|
31 |
+
text_encoder = FrozenCLIPEmbedder(cls.text_encoder_path, device=device)
|
32 |
+
text_encoder_2 = FrozenCLIPEmbedder(cls.text_encoder_2_path, device=device)
|
33 |
+
|
34 |
+
return cls(switti, vae, text_encoder, text_encoder_2, device)
|
35 |
+
|
36 |
+
@staticmethod
|
37 |
+
def to_image(tensor):
|
38 |
+
return [ToPILImage()(
|
39 |
+
(255 * img.cpu().detach()).to(torch.uint8))
|
40 |
+
for img in tensor]
|
41 |
+
|
42 |
+
def _encode_prompt(self, prompt: str | list[str]):
|
43 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
44 |
+
encodings = [
|
45 |
+
self.text_encoder.encode(prompt),
|
46 |
+
self.text_encoder_2.encode(prompt),
|
47 |
+
]
|
48 |
+
prompt_embeds = torch.concat(
|
49 |
+
[encoding.last_hidden_state for encoding in encodings], dim=-1
|
50 |
+
)
|
51 |
+
pooled_prompt_embeds = encodings[-1].pooler_output
|
52 |
+
attn_bias = encodings[-1].attn_bias
|
53 |
+
|
54 |
+
return prompt_embeds, pooled_prompt_embeds, attn_bias
|
55 |
+
|
56 |
+
def encode_prompt(
|
57 |
+
self,
|
58 |
+
prompt: str | list[str],
|
59 |
+
null_prompt: str = "",
|
60 |
+
encode_null: bool = True,
|
61 |
+
):
|
62 |
+
prompt_embeds, pooled_prompt_embeds, attn_bias = self._encode_prompt(prompt)
|
63 |
+
if encode_null:
|
64 |
+
B, L, hidden_dim = prompt_embeds.shape
|
65 |
+
pooled_dim = pooled_prompt_embeds.shape[1]
|
66 |
+
|
67 |
+
null_embeds, null_pooled_embeds, null_attn_bias = self._encode_prompt(null_prompt)
|
68 |
+
|
69 |
+
null_embeds = null_embeds[:, :L].expand(B, L, hidden_dim).to(prompt_embeds.device)
|
70 |
+
null_pooled_embeds = null_pooled_embeds.expand(B, pooled_dim).to(pooled_prompt_embeds.device)
|
71 |
+
null_attn_bias = null_attn_bias[:, :L].expand(B, L).to(attn_bias.device)
|
72 |
+
|
73 |
+
prompt_embeds = torch.cat([prompt_embeds, null_embeds], dim=0)
|
74 |
+
pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, null_pooled_embeds], dim=0)
|
75 |
+
attn_bias = torch.cat([attn_bias, null_attn_bias], dim=0)
|
76 |
+
|
77 |
+
return prompt_embeds, pooled_prompt_embeds, attn_bias
|
78 |
+
|
79 |
+
@torch.inference_mode()
|
80 |
+
def __call__(
|
81 |
+
self,
|
82 |
+
prompt: str | list[str],
|
83 |
+
null_prompt: str = "",
|
84 |
+
seed: int | None = None,
|
85 |
+
cfg: float = 4.0,
|
86 |
+
top_k: int = 400,
|
87 |
+
top_p: float = 0.95,
|
88 |
+
more_smooth: bool = False,
|
89 |
+
return_pil: bool = True,
|
90 |
+
smooth_start_si: int = 0,
|
91 |
+
turn_off_cfg_start_si: int = 10,
|
92 |
+
image_size: tuple[int, int] = (512, 512),
|
93 |
+
) -> torch.Tensor | list[PILImage]:
|
94 |
+
"""
|
95 |
+
only used for inference, on autoregressive mode
|
96 |
+
:param prompt: text prompt to generate an image
|
97 |
+
:param null_prompt: negative prompt for CFG
|
98 |
+
:param seed: random seed
|
99 |
+
:param cfg: classifier-free guidance ratio
|
100 |
+
:param top_k: top-k sampling
|
101 |
+
:param top_p: top-p sampling
|
102 |
+
:param more_smooth: sampling using gumbel softmax; only used in visualization, not used in FID/IS benchmarking
|
103 |
+
        :return: if return_pil: list of PIL Images, else: torch.tensor (B, 3, H, W) in [0, 1]
        """
        assert not self.switti.training
        switti = self.switti
        vae = self.vae
        vae_quant = self.vae.quantize
        if seed is None:
            rng = None
        else:
            switti.rng.manual_seed(seed)
            rng = switti.rng

        context, cond_vector, context_attn_bias = self.encode_prompt(prompt, null_prompt)

        B = context.shape[0] // 2

        cond_vector = switti.text_pooler(cond_vector)

        if switti.use_crop_cond:
            crop_coords = get_crop_condition(2 * B * [image_size[0]],
                                             2 * B * [image_size[1]],
                                             ).to(cond_vector.device)
            crop_embed = switti.crop_embed(crop_coords.view(-1)).reshape(2 * B, switti.D)
            crop_cond = switti.crop_proj(crop_embed)
        else:
            crop_cond = None

        sos = cond_BD = cond_vector

        lvl_pos = switti.lvl_embed(switti.lvl_1L)
        if not switti.rope:
            lvl_pos += switti.pos_1LC
        next_token_map = (
            sos.unsqueeze(1)
            + switti.pos_start.expand(2 * B, switti.first_l, -1)
            + lvl_pos[:, : switti.first_l]
        )
        cur_L = 0
        f_hat = sos.new_zeros(B, switti.Cvae, switti.patch_nums[-1], switti.patch_nums[-1])

        for b in switti.blocks:
            b.attn.kv_caching(switti.use_ar)  # use KV caching if switti is in the AR mode
            b.cross_attn.kv_caching(True)

        for si, pn in enumerate(switti.patch_nums):  # si: i-th segment
            ratio = si / switti.num_stages_minus_1
            x_BLC = next_token_map

            if switti.rope:
                freqs_cis = switti.freqs_cis[:, cur_L : cur_L + pn * pn]
            else:
                freqs_cis = switti.freqs_cis

            if si >= turn_off_cfg_start_si:
                x_BLC = x_BLC[:B]
                context = context[:B]
                context_attn_bias = context_attn_bias[:B]
                freqs_cis = freqs_cis[:B]
                cond_BD = cond_BD[:B]
                if crop_cond is not None:
                    crop_cond = crop_cond[:B]
                for b in switti.blocks:
                    if b.attn.caching:
                        b.attn.cached_k = b.attn.cached_k[:B]
                        b.attn.cached_v = b.attn.cached_v[:B]
                    if b.cross_attn.caching:
                        b.cross_attn.cached_k = b.cross_attn.cached_k[:B]
                        b.cross_attn.cached_v = b.cross_attn.cached_v[:B]

            for block in switti.blocks:
                x_BLC = block(
                    x=x_BLC,
                    cond_BD=cond_BD,
                    attn_bias=None,
                    context=context,
                    context_attn_bias=context_attn_bias,
                    freqs_cis=freqs_cis,
                    crop_cond=crop_cond,
                )
            cur_L += pn * pn

            logits_BlV = switti.get_logits(x_BLC, cond_BD)

            # guidance
            if si < turn_off_cfg_start_si:
                t = cfg * ratio
                logits_BlV = (1 + t) * logits_BlV[:B] - t * logits_BlV[B:]

            if more_smooth and si >= smooth_start_si:
                # not used when evaluating FID/IS/Precision/Recall
                gum_t = max(0.27 * (1 - ratio * 0.95), 0.005)  # refer to mask-git
                idx_Bl = gumbel_softmax_with_rng(
                    logits_BlV.mul(1 + ratio), tau=gum_t, hard=False, dim=-1, rng=rng,
                )
                h_BChw = idx_Bl @ vae_quant.embedding.weight.unsqueeze(0)
            else:
                # default nucleus sampling
                idx_Bl = sample_with_top_k_top_p_(
                    logits_BlV, rng=rng, top_k=top_k, top_p=top_p, num_samples=1,
                )[:, :, 0]
                h_BChw = vae_quant.embedding(idx_Bl)

            h_BChw = h_BChw.transpose_(1, 2).reshape(B, switti.Cvae, pn, pn)
            f_hat, next_token_map = vae_quant.get_next_autoregressive_input(
                si, len(switti.patch_nums), f_hat, h_BChw,
            )
            if si != switti.num_stages_minus_1:  # prepare for the next stage
                next_token_map = next_token_map.view(B, switti.Cvae, -1).transpose(1, 2)
                next_token_map = (
                    switti.word_embed(next_token_map)
                    + lvl_pos[:, cur_L : cur_L + switti.patch_nums[si + 1] ** 2]
                )
                # double the batch size due to CFG
                next_token_map = next_token_map.repeat(2, 1, 1)

        for b in switti.blocks:
            b.attn.kv_caching(False)
            b.cross_attn.kv_caching(False)

        # de-normalize, from [-1, 1] to [0, 1]
        img = vae.fhat_to_img(f_hat).add(1).mul(0.5)
        if return_pil:
            img = self.to_image(img)

        return img
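The guidance step in the sampling loop above blends the conditional and unconditional halves of the batch with a scale that grows linearly across scales (t = cfg * ratio), and CFG is dropped entirely from scale turn_off_cfg_start_si onward. A minimal torch-only sketch of that blending on dummy tensors (batch size, vocabulary size, and the cfg value are illustrative assumptions, not taken from this commit):

import torch

# first B rows are conditional logits, last B rows unconditional (batch doubled for CFG)
B, l, V = 2, 4, 16
logits = torch.randn(2 * B, l, V)

cfg, ratio = 6.0, 0.5                              # ratio = si / (num_scales - 1)
t = cfg * ratio
guided = (1 + t) * logits[:B] - t * logits[B:]     # same formula as in the loop above
print(guided.shape)                                # torch.Size([2, 4, 16])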
models/quant.py
ADDED
@@ -0,0 +1,398 @@
import math
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from torch import distributed as tdist
from torch import nn as nn
from torch.nn import functional as F

# this file only provides the VectorQuantizer2 used in VQVAE
__all__ = ["VectorQuantizer2"]


class VectorQuantizer2(nn.Module):
    # VQGAN originally use beta=1.0, never tried 0.25; SD seems using 0.25
    def __init__(
        self,
        vocab_size,
        Cvae,
        using_znorm,
        beta: float = 0.25,
        default_qresi_counts=0,
        v_patch_nums=None,
        quant_resi=0.5,
        share_quant_resi=4,  # share_quant_resi: args.qsr
    ):
        super().__init__()
        self.vocab_size: int = vocab_size
        self.Cvae: int = Cvae
        self.using_znorm: bool = using_znorm
        self.v_patch_nums: Tuple[int] = v_patch_nums

        self.quant_resi_ratio = quant_resi
        if share_quant_resi == 0:  # non-shared: \phi_{1 to K} for K scales
            self.quant_resi = PhiNonShared(
                [
                    (Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity())
                    for _ in range(default_qresi_counts or len(self.v_patch_nums))
                ]
            )
        elif share_quant_resi == 1:  # fully shared: only a single \phi for K scales
            self.quant_resi = PhiShared(
                Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()
            )
        else:  # partially shared: \phi_{1 to share_quant_resi} for K scales
            self.quant_resi = PhiPartiallyShared(
                nn.ModuleList([(
                    Phi(Cvae, quant_resi)
                    if abs(quant_resi) > 1e-6
                    else nn.Identity()
                ) for _ in range(share_quant_resi)])
            )

        self.register_buffer(
            "ema_vocab_hit_SV",
            torch.full((len(self.v_patch_nums), self.vocab_size), fill_value=0.0),
        )
        self.record_hit = 0

        self.beta: float = beta
        self.embedding = nn.Embedding(self.vocab_size, self.Cvae)

    def eini(self, eini):
        if eini > 0:
            nn.init.trunc_normal_(self.embedding.weight.data, std=eini)
        elif eini < 0:
            self.embedding.weight.data.uniform_(
                -abs(eini) / self.vocab_size, abs(eini) / self.vocab_size
            )

    def extra_repr(self) -> str:
        return f"{self.v_patch_nums}, znorm={self.using_znorm}, beta={self.beta} | S={len(self.v_patch_nums)}, quant_resi={self.quant_resi_ratio}"

    # ===================== `forward` is only used in VAE training =====================
    def forward(
        self, f_BChw: torch.Tensor, ret_usages=False
    ) -> Tuple[torch.Tensor, List[float], torch.Tensor]:
        dtype = f_BChw.dtype
        if dtype != torch.float32:
            f_BChw = f_BChw.float()
        B, C, H, W = f_BChw.shape
        f_no_grad = f_BChw.detach()

        f_rest = f_no_grad.clone()
        f_hat = torch.zeros_like(f_rest)

        with torch.cuda.amp.autocast(enabled=False):
            mean_vq_loss: torch.Tensor = 0.0
            vocab_hit_V = torch.zeros(
                self.vocab_size, dtype=torch.float, device=f_BChw.device
            )
            SN = len(self.v_patch_nums)
            for si, pn in enumerate(self.v_patch_nums):  # from small to large
                # find the nearest embedding
                if self.using_znorm:
                    rest_NC = (
                        F.interpolate(f_rest, size=(pn, pn), mode="area")
                        .permute(0, 2, 3, 1)
                        .reshape(-1, C)
                        if (si != SN - 1)
                        else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
                    )
                    rest_NC = F.normalize(rest_NC, dim=-1)
                    idx_N = torch.argmax(
                        rest_NC @ F.normalize(self.embedding.weight.data.T, dim=0),
                        dim=1,
                    )
                else:
                    rest_NC = (
                        F.interpolate(f_rest, size=(pn, pn), mode="area")
                        .permute(0, 2, 3, 1)
                        .reshape(-1, C)
                        if (si != SN - 1)
                        else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
                    )
                    d_no_grad = torch.sum(
                        rest_NC.square(), dim=1, keepdim=True
                    ) + torch.sum(
                        self.embedding.weight.data.square(), dim=1, keepdim=False
                    )
                    d_no_grad.addmm_(
                        rest_NC, self.embedding.weight.data.T, alpha=-2, beta=1
                    )  # (B*h*w, vocab_size)
                    idx_N = torch.argmin(d_no_grad, dim=1)

                hit_V = idx_N.bincount(minlength=self.vocab_size).float()
                if self.training:
                    # if dist.initialized():
                    handler = tdist.all_reduce(hit_V, async_op=True)

                # calc loss
                idx_Bhw = idx_N.view(B, pn, pn)
                h_BChw = (
                    F.interpolate(
                        self.embedding(idx_Bhw).permute(0, 3, 1, 2),
                        size=(H, W),
                        mode="bicubic",
                    ).contiguous()
                    if (si != SN - 1)
                    else self.embedding(idx_Bhw).permute(0, 3, 1, 2).contiguous()
                )
                h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
                f_hat = f_hat + h_BChw
                f_rest -= h_BChw

                if self.training:  # and dist.initialized():
                    handler.wait()
                    if self.record_hit == 0:
                        self.ema_vocab_hit_SV[si].copy_(hit_V)
                    elif self.record_hit < 100:
                        self.ema_vocab_hit_SV[si].mul_(0.9).add_(hit_V.mul(0.1))
                    else:
                        self.ema_vocab_hit_SV[si].mul_(0.99).add_(hit_V.mul(0.01))
                    self.record_hit += 1
                vocab_hit_V.add_(hit_V)
                mean_vq_loss += F.mse_loss(f_hat.data, f_BChw).mul_(self.beta) + F.mse_loss(f_hat, f_no_grad)

            mean_vq_loss *= 1.0 / SN
            f_hat = (f_hat.data - f_no_grad).add_(f_BChw)

        margin = (
            tdist.get_world_size()
            * (f_BChw.numel() / f_BChw.shape[1])
            / self.vocab_size
            * 0.08
        )
        # margin = pn*pn / 100
        if ret_usages:
            usages = [
                (self.ema_vocab_hit_SV[si] >= margin).float().mean().item() * 100
                for si, pn in enumerate(self.v_patch_nums)
            ]
        else:
            usages = None
        return f_hat, usages, mean_vq_loss

    # ===================== `forward` is only used in VAE training =====================

    def embed_to_fhat(
        self, ms_h_BChw: List[torch.Tensor], all_to_max_scale=True, last_one=False
    ) -> Union[List[torch.Tensor], torch.Tensor]:
        ls_f_hat_BChw = []
        B = ms_h_BChw[0].shape[0]
        H = W = self.v_patch_nums[-1]
        SN = len(self.v_patch_nums)
        if all_to_max_scale:
            f_hat = ms_h_BChw[0].new_zeros(B, self.Cvae, H, W, dtype=torch.float32)
            for si, pn in enumerate(self.v_patch_nums):  # from small to large
                h_BChw = ms_h_BChw[si]
                if si < len(self.v_patch_nums) - 1:
                    h_BChw = F.interpolate(h_BChw, size=(H, W), mode="bicubic")
                h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
                f_hat.add_(h_BChw)
                if last_one:
                    ls_f_hat_BChw = f_hat
                else:
                    ls_f_hat_BChw.append(f_hat.clone())
        else:
            # WARNING: this is not the case in VQ-VAE training or inference (we'll interpolate every token map to the max H W, like above)
            # WARNING: this should only be used for experimental purpose
            f_hat = ms_h_BChw[0].new_zeros(
                B,
                self.Cvae,
                self.v_patch_nums[0],
                self.v_patch_nums[0],
                dtype=torch.float32,
            )
            for si, pn in enumerate(self.v_patch_nums):  # from small to large
                f_hat = F.interpolate(f_hat, size=(pn, pn), mode="bicubic")
                h_BChw = self.quant_resi[si / (SN - 1)](ms_h_BChw[si])
                f_hat.add_(h_BChw)
                if last_one:
                    ls_f_hat_BChw = f_hat
                else:
                    ls_f_hat_BChw.append(f_hat)

        return ls_f_hat_BChw

    def f_to_idxBl_or_fhat(
        self,
        f_BChw: torch.Tensor,
        to_fhat: bool,
        v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None,
        noise_std: Optional[float] = None,
    ) -> List[Union[torch.Tensor, torch.LongTensor]]:  # z_BChw is the feature from inp_img_no_grad
        B, C, H, W = f_BChw.shape
        f_no_grad = f_BChw.detach()
        f_rest = f_no_grad.clone()
        f_hat = torch.zeros_like(f_rest)

        f_hat_or_idx_Bl: List[torch.Tensor] = []

        patch_hws = [
            (pn, pn) if isinstance(pn, int) else (pn[0], pn[1])
            for pn in (v_patch_nums or self.v_patch_nums)
        ]  # from small to large
        assert (
            patch_hws[-1][0] == H and patch_hws[-1][1] == W
        ), f"{patch_hws[-1]=} != ({H=}, {W=})"

        SN = len(patch_hws)
        for si, (ph, pw) in enumerate(patch_hws):  # from small to large
            # find the nearest embedding
            z_NC = (
                F.interpolate(f_rest, size=(ph, pw), mode="area")
                .permute(0, 2, 3, 1)
                .reshape(-1, C)
                if (si != SN - 1)
                else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
            )
            if noise_std is not None:
                z_NC = math.sqrt(1 - noise_std ** 2) * z_NC + torch.randn_like(z_NC) * noise_std

            if self.using_znorm:
                z_NC = F.normalize(z_NC, dim=-1)
                idx_N = torch.argmax(
                    z_NC @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1
                )
            else:
                d_no_grad = torch.sum(z_NC.square(), dim=1, keepdim=True) + torch.sum(
                    self.embedding.weight.data.square(), dim=1, keepdim=False
                )
                d_no_grad.addmm_(
                    z_NC, self.embedding.weight.data.T, alpha=-2, beta=1
                )  # (B*h*w, vocab_size)
                idx_N = torch.argmin(d_no_grad, dim=1)

            idx_Bhw = idx_N.view(B, ph, pw)
            h_BChw = (
                F.interpolate(
                    self.embedding(idx_Bhw).permute(0, 3, 1, 2),
                    size=(H, W),
                    mode="bicubic",
                ).contiguous()
                if (si != SN - 1)
                else self.embedding(idx_Bhw).permute(0, 3, 1, 2).contiguous()
            )
            h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
            f_hat.add_(h_BChw)
            f_rest.sub_(h_BChw)
            f_hat_or_idx_Bl.append(
                f_hat.clone() if to_fhat else idx_N.reshape(B, ph * pw)
            )

        return f_hat_or_idx_Bl

    # ===================== idxBl_to_switti_input: only used in Switti training, for getting teacher-forcing input =====================
    def idxBl_to_switti_input(self, gt_ms_idx_Bl: List[torch.Tensor]) -> torch.Tensor:
        next_scales = []
        B = gt_ms_idx_Bl[0].shape[0]
        C = self.Cvae
        H = W = self.v_patch_nums[-1]
        SN = len(self.v_patch_nums)

        f_hat = gt_ms_idx_Bl[0].new_zeros(B, C, H, W, dtype=torch.float32)
        pn_next: int = self.v_patch_nums[0]
        for si in range(SN - 1):
            h_BChw = F.interpolate(
                self.embedding(gt_ms_idx_Bl[si])
                .transpose_(1, 2)
                .view(B, C, pn_next, pn_next),
                size=(H, W),
                mode="bicubic",
            )
            f_hat.add_(self.quant_resi[si / (SN - 1)](h_BChw))
            pn_next = self.v_patch_nums[si + 1]
            next_scales.append(
                F.interpolate(f_hat, size=(pn_next, pn_next), mode="area")
                .view(B, C, -1)
                .transpose(1, 2)
            )
        # cat BlCs to BLC, this should be float32
        return torch.cat(next_scales, dim=1) if len(next_scales) else None

    # ===================== get_next_autoregressive_input: only used in Switti inference, for getting next step's input =====================
    def get_next_autoregressive_input(
        self, si: int, SN: int, f_hat: torch.Tensor, h_BChw: torch.Tensor
    ) -> Tuple[Optional[torch.Tensor], torch.Tensor]:  # only used in Switti inference
        HW = self.v_patch_nums[-1]
        if si != SN - 1:
            h = self.quant_resi[si / (SN - 1)](
                F.interpolate(h_BChw, size=(HW, HW), mode="bicubic")
            )  # conv after upsample
            f_hat.add_(h)
            return f_hat, F.interpolate(
                f_hat,
                size=(self.v_patch_nums[si + 1], self.v_patch_nums[si + 1]),
                mode="area",
            )
        else:
            h = self.quant_resi[si / (SN - 1)](h_BChw)
            f_hat.add_(h)
            return f_hat, f_hat


class Phi(nn.Conv2d):
    def __init__(self, embed_dim, quant_resi):
        ks = 3
        super().__init__(
            in_channels=embed_dim,
            out_channels=embed_dim,
            kernel_size=ks,
            stride=1,
            padding=ks // 2,
        )
        self.resi_ratio = abs(quant_resi)

    def forward(self, h_BChw):
        return h_BChw.mul(1 - self.resi_ratio) + super().forward(h_BChw).mul_(
            self.resi_ratio
        )


class PhiShared(nn.Module):
    def __init__(self, qresi: Phi):
        super().__init__()
        self.qresi: Phi = qresi

    def __getitem__(self, _) -> Phi:
        return self.qresi


class PhiPartiallyShared(nn.Module):
    def __init__(self, qresi_ls: nn.ModuleList):
        super().__init__()
        self.qresi_ls = qresi_ls
        K = len(qresi_ls)
        self.ticks = (
            np.linspace(1 / 3 / K, 1 - 1 / 3 / K, K)
            if K == 4
            else np.linspace(1 / 2 / K, 1 - 1 / 2 / K, K)
        )

    def __getitem__(self, at_from_0_to_1: float) -> Phi:
        return self.qresi_ls[np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()]

    def extra_repr(self) -> str:
        return f"ticks={self.ticks}"


class PhiNonShared(nn.ModuleList):
    def __init__(self, qresi: List):
        super().__init__(qresi)
        # self.qresi = qresi
        K = len(qresi)
        self.ticks = (
            np.linspace(1 / 3 / K, 1 - 1 / 3 / K, K)
            if K == 4
            else np.linspace(1 / 2 / K, 1 - 1 / 2 / K, K)
        )

    def __getitem__(self, at_from_0_to_1: float) -> Phi:
        return super().__getitem__(
            np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()
        )

    def extra_repr(self) -> str:
        return f"ticks={self.ticks}"
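VectorQuantizer2 quantizes a feature map coarse-to-fine: at each scale the remaining residual is downsampled, matched to the nearest codebook entry, upsampled back to full resolution, and subtracted before moving to the next scale. The torch-only sketch below illustrates that loop with a random codebook and made-up sizes; it omits the Phi residual convolution and uses a plain cdist lookup instead of the addmm distance trick, so it is an illustration of the idea rather than the class above:

import torch
import torch.nn.functional as F

C, V, patch_nums = 8, 64, (1, 2, 4)
codebook = torch.randn(V, C)                      # stand-in for the nn.Embedding weights
f = torch.randn(1, C, 4, 4)                       # feature map to quantize
f_rest, f_hat = f.clone(), torch.zeros_like(f)

for pn in patch_nums:
    z = F.interpolate(f_rest, size=(pn, pn), mode="area") if pn != 4 else f_rest
    z_NC = z.permute(0, 2, 3, 1).reshape(-1, C)
    idx = torch.cdist(z_NC, codebook).argmin(dim=1)              # nearest codebook entry
    h = codebook[idx].reshape(1, pn, pn, C).permute(0, 3, 1, 2)
    h = F.interpolate(h, size=(4, 4), mode="bicubic") if pn != 4 else h
    f_hat += h                                                   # accumulate the approximation
    f_rest -= h                                                  # quantize only what is left
print((f - f_hat).abs().mean())                                  # residual error after all scales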
models/rope.py
ADDED
@@ -0,0 +1,48 @@
1 |
+
import torch
|
2 |
+
|
3 |
+
|
4 |
+
def init_t_xy(end_x: int, end_y: int):
|
5 |
+
t = torch.arange(end_x * end_y, dtype=torch.float32)
|
6 |
+
t_x = (t % end_x).float()
|
7 |
+
t_y = torch.div(t, end_x, rounding_mode="floor").float()
|
8 |
+
return t_x, t_y
|
9 |
+
|
10 |
+
|
11 |
+
def compute_axial_cis(
|
12 |
+
dim: int, end_x: int, end_y: int, theta: float = 100.0, norm_coeff: int = 1
|
13 |
+
):
|
14 |
+
freqs_x = (
|
15 |
+
1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
|
16 |
+
* norm_coeff
|
17 |
+
)
|
18 |
+
freqs_y = (
|
19 |
+
1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
|
20 |
+
* norm_coeff
|
21 |
+
)
|
22 |
+
|
23 |
+
t_x, t_y = init_t_xy(end_x, end_y)
|
24 |
+
freqs_x = torch.outer(t_x, freqs_x)
|
25 |
+
freqs_y = torch.outer(t_y, freqs_y)
|
26 |
+
freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
|
27 |
+
freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
|
28 |
+
return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
|
29 |
+
|
30 |
+
|
31 |
+
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
|
32 |
+
ndim = x.ndim
|
33 |
+
assert 0 <= 1 < ndim
|
34 |
+
freqs_cis = freqs_cis[:, x.shape[1], ...]
|
35 |
+
if freqs_cis.shape == (x.shape[-2], x.shape[-1]):
|
36 |
+
shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
|
37 |
+
elif freqs_cis.shape == (x.shape[-3], x.shape[-2], x.shape[-1]):
|
38 |
+
shape = [d if i >= ndim - 3 else 1 for i, d in enumerate(x.shape)]
|
39 |
+
return freqs_cis.view(*shape)
|
40 |
+
|
41 |
+
|
42 |
+
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor):
|
43 |
+
with torch.cuda.amp.autocast(enabled=False):
|
44 |
+
x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
|
45 |
+
# freqs_cis = reshape_for_broadcast(freqs_cis, x).to(x_in.device)
|
46 |
+
freqs_cis = freqs_cis[None, :, : x.shape[2], ...].to(x_in.device)
|
47 |
+
x_out = torch.view_as_real(x * freqs_cis).flatten(3)
|
48 |
+
return x_out.type_as(x_in)
|
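compute_axial_cis builds one complex rotation per token from its (x, y) grid coordinate, and apply_rotary_emb rotates query/key features by pairing adjacent channels into complex numbers. A quick shape check of how the two functions fit together, assuming the repo root is on PYTHONPATH so models.rope is importable (the head dimension of 16 and the 4x4 grid are only example values; the head dim must be divisible by 4):

import torch
from models.rope import apply_rotary_emb, compute_axial_cis  # functions defined above

head_dim, pn = 16, 4
freqs_cis = compute_axial_cis(dim=head_dim, end_x=pn, end_y=pn)[None]  # (1, pn*pn, head_dim // 2), complex
q = torch.randn(1, 2, pn * pn, head_dim)                               # (batch, heads, tokens, head_dim)
q_rot = apply_rotary_emb(q, freqs_cis)
print(q_rot.shape)                                                     # torch.Size([1, 2, 16, 16])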
models/switti.py
ADDED
@@ -0,0 +1,409 @@
import math
from functools import partial
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from diffusers.models.embeddings import GaussianFourierProjection

from models.basic_switti import AdaLNBeforeHead, AdaLNSelfCrossAttn
from models.rope import compute_axial_cis


def get_crop_condition(
    heights: list,
    widths: list,
    base_size=512,
):
    if type(heights[0]) == type(widths[0]) == str:
        heights = [int(h) for h in heights]
        widths = [int(w) for w in widths]
    h = torch.tensor(heights, dtype=torch.int).unsqueeze(1)
    w = torch.tensor(widths, dtype=torch.int).unsqueeze(1)
    hw = torch.cat([h, w], dim=1)

    ratio = base_size / hw.min(-1)[0]
    orig_size = (hw * ratio[:, None]).to(torch.int)
    crop_coords = ((orig_size - base_size) // 2).clamp(min=0)
    crop_cond = torch.cat([orig_size, crop_coords], dim=1)

    return crop_cond


class Switti(nn.Module):
    def __init__(
        self,
        Cvae=32,
        V=4096,
        rope=True,
        rope_theta=10000,
        rope_size=128,
        depth=16,
        embed_dim=1024,
        num_heads=16,
        mlp_ratio=4.0,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_eps=1e-6,
        attn_l2_norm=True,
        patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),  # 10 steps by default
        fused_if_available=True,
        use_swiglu_ffn=True,
        use_ar=False,
        use_crop_cond=True,
    ):
        super().__init__()
        # 0. hyperparameters
        assert embed_dim % num_heads == 0
        self.depth, self.C, self.D, self.num_heads = (
            depth,
            embed_dim,
            embed_dim,
            num_heads,
        )
        self.Cvae, self.V = Cvae, V

        self.patch_nums: Tuple[int] = patch_nums
        self.L = sum(pn**2 for pn in self.patch_nums)
        self.first_l = self.patch_nums[0] ** 2
        self.rope = rope

        self.num_stages_minus_1 = len(self.patch_nums) - 1
        self.rng = torch.Generator(device="cuda")

        # 1. input (word) embedding
        self.word_embed = nn.Linear(self.Cvae, self.C)

        # 2. text embedding
        self.pooled_embed_size = 1280
        self.context_dim = 1280 + 768
        self.text_pooler = nn.Linear(self.pooled_embed_size, self.D)

        init_std = math.sqrt(1 / self.C / 3)
        self.pos_start = nn.Parameter(torch.empty(1, self.first_l, self.C))
        nn.init.trunc_normal_(self.pos_start.data, mean=0, std=init_std)

        # 3. position embedding
        if not self.rope:
            # absolute position embedding
            pos_1LC = []
            for i, pn in enumerate(self.patch_nums):
                pe = torch.empty(1, pn * pn, self.C)
                nn.init.trunc_normal_(pe, mean=0, std=init_std)
                pos_1LC.append(pe)
            pos_1LC = torch.cat(pos_1LC, dim=1)  # 1, L, C
            assert tuple(pos_1LC.shape) == (1, self.L, self.C)
            self.pos_1LC = nn.Parameter(pos_1LC)
            self.freqs_cis = None
        else:
            # RoPE position embedding
            assert (
                self.C // self.num_heads
            ) % 4 == 0, "2d rope needs head dim to be divisible by 4"
            patch_nums_m1 = tuple(pn - 1 if pn > 1 else 1 for pn in self.patch_nums)
            self.compute_cis = partial(compute_axial_cis, dim=self.C // self.num_heads)
            freqs_cis = []
            for i, pn in enumerate(self.patch_nums):
                norm_coeff = rope_size / patch_nums_m1[i]
                cur_freqs = self.compute_cis(
                    end_x=pn, end_y=pn, theta=rope_theta, norm_coeff=norm_coeff
                )
                freqs_cis.append(cur_freqs[None, ...])
            self.freqs_cis = torch.cat(freqs_cis, dim=1)  # 1, L, C // 2 -- complex

        # level embedding (similar to GPT's segment embedding,
        # used to distinguish different levels of token pyramid)
        self.lvl_embed = nn.Embedding(len(self.patch_nums), self.C)
        nn.init.trunc_normal_(self.lvl_embed.weight.data, mean=0, std=init_std)

        # 4. backbone blocks
        self.drop_path_rate = drop_path_rate
        # stochastic depth decay rule (linearly increasing)
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.ModuleList([])
        for block_idx in range(depth):
            self.blocks.append(
                AdaLNSelfCrossAttn(
                    cond_dim=self.D,
                    block_idx=block_idx,
                    embed_dim=self.C,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[block_idx],
                    last_drop_p=0 if block_idx == 0 else dpr[block_idx - 1],
                    qk_norm=attn_l2_norm,
                    context_dim=self.context_dim,
                    use_swiglu_ffn=use_swiglu_ffn,
                    norm_eps=norm_eps,
                    use_crop_cond=use_crop_cond,
                )
            )

        fused_add_norm_fns = [b.fused_add_norm_fn is not None for b in self.blocks]
        self.using_fused_add_norm_fn = any(fused_add_norm_fns)
        print(
            f"\n[constructor] ==== fused_if_available={fused_if_available} "
            f"(fusing_add_ln={sum(fused_add_norm_fns)}/{self.depth}, "
            f"fusing_mlp={sum(b.ffn.fused_mlp_func is not None for b in self.blocks)}/{self.depth}) ==== \n"
            f" [Switti config ] embed_dim={embed_dim}, num_heads={num_heads}, "
            f"depth={depth}, mlp_ratio={mlp_ratio}\n"
            f" [drop ratios ] drop_rate={drop_rate}, attn_drop_rate={attn_drop_rate}, "
            f"drop_path_rate={drop_path_rate:g} ({torch.linspace(0, drop_path_rate, depth)})",
            end="\n\n",
            flush=True,
        )

        # Prepare crop condition embedder
        self.use_crop_cond = use_crop_cond
        if use_crop_cond:
            # crop condition is represented with 4 int values; each is embedded to self.D // 4 dims
            assert self.D % 8 == 0
            self.crop_embed = GaussianFourierProjection(
                self.D // 2 // 4, set_W_to_weight=False, log=False, flip_sin_to_cos=False
            )
            self.crop_proj = nn.Linear(self.D, self.D)

        # 5. attention mask used in training (for masking out the future)
        # it won't be used in inference, since kv cache is enabled
        self.use_ar = use_ar
        d: torch.Tensor = torch.cat(
            [torch.full((pn * pn,), i) for i, pn in enumerate(self.patch_nums)]
        ).view(1, self.L, 1)
        dT = d.transpose(1, 2)  # dT: 11L
        lvl_1L = dT[:, 0].contiguous()
        self.register_buffer("lvl_1L", lvl_1L)

        if self.use_ar:
            attn_bias_for_masking = torch.where(d >= dT, 0.0, -torch.inf)
        else:
            attn_bias_for_masking = torch.where(d == dT, 0.0, -torch.inf)

        attn_bias_for_masking = attn_bias_for_masking.reshape(1, 1, self.L, self.L)
        self.register_buffer(
            "attn_bias_for_masking", attn_bias_for_masking.contiguous()
        )

        # 6. classifier head
        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
        self.head_nm = AdaLNBeforeHead(self.C, self.D, norm_layer=norm_layer)
        self.head = nn.Linear(self.C, self.V)

        # By default disable gradient checkpointing
        self.use_gradient_checkpointing = False

    def enable_gradient_checkpointing(self):
        self.use_gradient_checkpointing = True

    def disable_gradient_checkpointing(self):
        self.use_gradient_checkpointing = False

    def get_logits(
        self,
        h_or_h_and_residual: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
        cond_BD: Optional[torch.Tensor],
    ):
        if not isinstance(h_or_h_and_residual, torch.Tensor):
            h, resi = h_or_h_and_residual  # fused_add_norm must be used
            h = resi + self.blocks[-1].drop_path(h)
        else:  # fused_add_norm is not used
            h = h_or_h_and_residual
        return self.head(self.head_nm(h, cond_BD))

    def forward(
        self,
        x_BLCv_wo_first_l: torch.Tensor,
        prompt_embeds: torch.Tensor,
        pooled_prompt_embeds: torch.Tensor,
        prompt_attn_bias: torch.Tensor,
        batch_height: list[int] | None = None,
        batch_width: list[int] | None = None,
    ) -> torch.Tensor:  # returns logits_BLV
        """
        :param x_BLCv_wo_first_l: teacher forcing input (B, self.L-self.first_l, self.Cvae)
        :param prompt_embeds (B, context_len, self.context_dim):
            text features from pipe.text_encoder and pipe.text_encoder_2,
            concatenated along dim=-1, padded to longest along dim=1
        :param pooled_prompt_embeds (B, self.pooled_embed_size):
            pooled text features from pipe.text_encoder_2
        :param prompt_attn_bias (B, context_len):
            boolean mask to specify which tokens are not padding
        :param batch_height (B,): original height of images in a batch.
        :param batch_width (B,): original width of images in a batch.
            Only used when self.use_crop_cond = True
        :return: logits BLV, V is vocab_size
        """
        bg, ed = 0, self.L
        B = x_BLCv_wo_first_l.shape[0]
        with torch.amp.autocast('cuda', enabled=False):
            pooled_prompt_embeds = self.text_pooler(pooled_prompt_embeds)

            sos = cond_BD = pooled_prompt_embeds
            sos = sos.unsqueeze(1).expand(B, self.first_l, -1) + self.pos_start.expand(
                B, self.first_l, -1
            )

            x_BLC = torch.cat(
                (sos, self.word_embed(x_BLCv_wo_first_l.float())), dim=1
            )
            x_BLC += self.lvl_embed(
                self.lvl_1L[:, :ed].expand(B, -1)
            )  # lvl: BLC; pos: 1LC
            if not self.rope:
                x_BLC += self.pos_1LC[:, :ed]
        attn_bias = self.attn_bias_for_masking[:, :, :ed, :ed]

        if self.use_crop_cond:
            crop_coords = get_crop_condition(batch_height, batch_width).to(cond_BD.device)
            crop_embed = self.crop_embed(crop_coords.view(-1)).reshape(B, self.D)
            crop_cond = self.crop_proj(crop_embed)
        else:
            crop_cond = None

        # hack: get the dtype if mixed precision is used
        temp = x_BLC.new_ones(8, 8)
        main_type = torch.matmul(temp, temp).dtype

        x_BLC = x_BLC.to(dtype=main_type)
        cond_BD = cond_BD.to(dtype=main_type)
        attn_bias = attn_bias.to(dtype=main_type)

        for block in self.blocks:
            if self.use_gradient_checkpointing:
                x_BLC = torch.utils.checkpoint.checkpoint(
                    block,
                    x=x_BLC,
                    cond_BD=cond_BD,
                    attn_bias=attn_bias,
                    context=prompt_embeds,
                    freqs_cis=self.freqs_cis,
                    context_attn_bias=prompt_attn_bias,
                    crop_cond=crop_cond,
                    use_reentrant=False,
                )
            else:
                x_BLC = block(
                    x=x_BLC,
                    cond_BD=cond_BD,
                    attn_bias=attn_bias,
                    context=prompt_embeds,
                    freqs_cis=self.freqs_cis,
                    context_attn_bias=prompt_attn_bias,
                    crop_cond=crop_cond,
                )

        with torch.amp.autocast('cuda', enabled=not self.training):
            x_BLC = self.get_logits(x_BLC, cond_BD.float())

        return x_BLC  # logits BLV, V is vocab_size

    def init_weights(
        self,
        init_adaln=0.5,
        init_adaln_gamma=1e-5,
        init_head=0.02,
        init_std=0.02,
    ):
        if init_std < 0:
            init_std = (1 / self.C / 3) ** 0.5  # init_std < 0: automated

        print(f"[init_weights] {type(self).__name__} with {init_std=:g}")
        for m in self.modules():
            with_weight = hasattr(m, "weight") and m.weight is not None
            with_bias = hasattr(m, "bias") and m.bias is not None
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight.data, std=init_std)
                if with_bias:
                    m.bias.data.zero_()
            elif isinstance(m, nn.Embedding):
                nn.init.trunc_normal_(m.weight.data, std=init_std)
                if m.padding_idx is not None:
                    m.weight.data[m.padding_idx].zero_()
            elif isinstance(
                m,
                (
                    nn.LayerNorm,
                    nn.BatchNorm1d,
                    nn.BatchNorm2d,
                    nn.BatchNorm3d,
                    nn.SyncBatchNorm,
                    nn.GroupNorm,
                    nn.InstanceNorm1d,
                    nn.InstanceNorm2d,
                    nn.InstanceNorm3d,
                ),
            ):
                if with_weight:
                    m.weight.data.fill_(1.0)
                if with_bias:
                    m.bias.data.zero_()

        if init_head >= 0:
            if isinstance(self.head, nn.Linear):
                self.head.weight.data.mul_(init_head)
                self.head.bias.data.zero_()
            elif isinstance(self.head, nn.Sequential):
                self.head[-1].weight.data.mul_(init_head)
                self.head[-1].bias.data.zero_()

        if isinstance(self.head_nm, AdaLNBeforeHead):
            self.head_nm.ada_lin[-1].weight.data.mul_(init_adaln)
            if (
                hasattr(self.head_nm.ada_lin[-1], "bias")
                and self.head_nm.ada_lin[-1].bias is not None
            ):
                self.head_nm.ada_lin[-1].bias.data.zero_()

        depth = len(self.blocks)
        for block in self.blocks:
            block.attn.proj.weight.data.div_(math.sqrt(2 * depth))
            block.cross_attn.proj.weight.data.div_(math.sqrt(2 * depth))
            if hasattr(block.ffn, "fc2"):
                block.ffn.fc2.weight.data.div_(math.sqrt(2 * depth))

            if hasattr(block, "ada_lin"):
                block.ada_lin[-1].weight.data[2 * self.C :].mul_(init_adaln)
                block.ada_lin[-1].weight.data[: 2 * self.C].mul_(init_adaln_gamma)
                if (
                    hasattr(block.ada_lin[-1], "bias")
                    and block.ada_lin[-1].bias is not None
                ):
                    block.ada_lin[-1].bias.data.zero_()
            elif hasattr(block, "ada_gss"):
                block.ada_gss.data[:, :, 2:].mul_(init_adaln)
                block.ada_gss.data[:, :, :2].mul_(init_adaln_gamma)

    def extra_repr(self):
        return f"drop_path_rate={self.drop_path_rate:g}"


class SwittiHF(Switti, PyTorchModelHubMixin):
    # tags=["image-generation"]):
    def __init__(
        self,
        depth=30,
        rope=True,
        rope_theta=10000,
        rope_size=128,
        use_swiglu_ffn=True,
        use_ar=False,
        use_crop_cond=True,
    ):
        heads = depth
        width = depth * 64
        super().__init__(
            depth=depth,
            embed_dim=width,
            num_heads=heads,
            patch_nums=(1, 2, 3, 4, 6, 9, 13, 18, 24, 32),
            rope=rope,
            rope_theta=rope_theta,
            rope_size=rope_size,
            use_swiglu_ffn=use_swiglu_ffn,
            use_ar=use_ar,
            use_crop_cond=use_crop_cond,
        )
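get_crop_condition mirrors SDXL-style size/crop conditioning: the original (height, width) is rescaled so the shorter side matches base_size, and the implied center-crop offsets are appended, giving four integers per image. A small check of what the transformer receives for a 1024x768 input, assuming the Space's requirements are installed and models.switti is importable from the repo root (the expected values in the comments are computed by the same arithmetic and may be off by one pixel due to float rounding):

import torch
from models.switti import get_crop_condition  # defined above in this commit

cond = get_crop_condition([1024], [768], base_size=512)
# short side 768 -> 512, so the resized size is roughly (682, 512)
# and the implied center crop starts near ((682 - 512) // 2, 0) = (85, 0)
print(cond)  # expected: tensor([[682, 512,  85,   0]], dtype=torch.int32)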
models/vqvae.py
ADDED
@@ -0,0 +1,184 @@
"""
References:
- VectorQuantizer2: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L110
- GumbelQuantize: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L213
- VQVAE (VQModel): https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/models/autoencoder.py#L14
"""

from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

from .basic_vae import Decoder, Encoder
from .quant import VectorQuantizer2


class VQVAE(nn.Module):
    def __init__(
        self,
        vocab_size=4096,
        z_channels=32,
        ch=160,
        dropout=0.0,
        beta=0.25,  # commitment loss weight
        using_znorm=False,  # whether to normalize when computing the nearest neighbors
        quant_conv_ks=3,  # quant conv kernel size
        quant_resi=0.5,  # 0.5 means \phi(x) = 0.5conv(x) + (1-0.5)x
        share_quant_resi=4,  # use 4 \phi layers for K scales: partially-shared \phi
        default_qresi_counts=0,  # if is 0: automatically set to len(v_patch_nums)
        # number of patches for each scale, h_{1 to K} = w_{1 to K} = v_patch_nums[k]
        v_patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),
        test_mode=True,
    ):
        super().__init__()
        self.test_mode = test_mode
        self.V, self.Cvae = vocab_size, z_channels
        # ddconfig is copied from https://github.com/CompVis/latent-diffusion/blob/e66308c7f2e64cb581c6d27ab6fbeb846828253b/models/first_stage_models/vq-f16/config.yaml
        ddconfig = dict(
            dropout=dropout,
            ch=ch,
            z_channels=z_channels,
            in_channels=3,
            ch_mult=(1, 1, 2, 2, 4),
            num_res_blocks=2,  # from vq-f16/config.yaml above
            using_sa=True,
            using_mid_sa=True,  # from vq-f16/config.yaml above
            # resamp_with_conv=True,  # always True, removed.
        )
        ddconfig.pop("double_z", None)  # only KL-VAE should use double_z=True
        self.encoder = Encoder(double_z=False, **ddconfig)
        self.decoder = Decoder(**ddconfig)

        self.vocab_size = vocab_size
        self.downsample = 2 ** (len(ddconfig["ch_mult"]) - 1)
        self.quantize: VectorQuantizer2 = VectorQuantizer2(
            vocab_size=vocab_size,
            Cvae=self.Cvae,
            using_znorm=using_znorm,
            beta=beta,
            default_qresi_counts=default_qresi_counts,
            v_patch_nums=v_patch_nums,
            quant_resi=quant_resi,
            share_quant_resi=share_quant_resi,
        )
        self.quant_conv = torch.nn.Conv2d(
            self.Cvae, self.Cvae, quant_conv_ks, stride=1, padding=quant_conv_ks // 2
        )
        self.post_quant_conv = torch.nn.Conv2d(
            self.Cvae, self.Cvae, quant_conv_ks, stride=1, padding=quant_conv_ks // 2
        )

        if self.test_mode:
            self.eval()
            [p.requires_grad_(False) for p in self.parameters()]

    # ===================== `forward` is only used in VAE training =====================
    def forward(self, inp, ret_usages=False):  # -> rec_B3HW, idx_N, loss
        VectorQuantizer2.forward
        f_hat, usages, vq_loss = self.quantize(
            self.quant_conv(self.encoder(inp)), ret_usages=ret_usages
        )
        return self.decoder(self.post_quant_conv(f_hat)), usages, vq_loss

    # ===================== `forward` is only used in VAE training =====================

    def fhat_to_img(self, f_hat: torch.Tensor):
        return self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)

    def img_to_idxBl(
        self,
        inp_img_no_grad: torch.Tensor,
        v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None,
        noise_std: Optional[float] = None,
    ) -> List[torch.LongTensor]:  # return List[Bl]
        f = self.quant_conv(self.encoder(inp_img_no_grad))
        return self.quantize.f_to_idxBl_or_fhat(
            f, to_fhat=False, v_patch_nums=v_patch_nums, noise_std=noise_std,
        )

    def idxBl_to_img(
        self, ms_idx_Bl: List[torch.Tensor], same_shape: bool, last_one=False
    ) -> Union[List[torch.Tensor], torch.Tensor]:
        B = ms_idx_Bl[0].shape[0]
        ms_h_BChw = []
        for idx_Bl in ms_idx_Bl:
            l = idx_Bl.shape[1]
            pn = round(l**0.5)
            ms_h_BChw.append(
                self.quantize.embedding(idx_Bl)
                .transpose(1, 2)
                .view(B, self.Cvae, pn, pn)
            )
        return self.embed_to_img(
            ms_h_BChw=ms_h_BChw, all_to_max_scale=same_shape, last_one=last_one
        )

    def embed_to_img(
        self, ms_h_BChw: List[torch.Tensor], all_to_max_scale: bool, last_one=False
    ) -> Union[List[torch.Tensor], torch.Tensor]:
        if last_one:
            return self.decoder(
                self.post_quant_conv(
                    self.quantize.embed_to_fhat(
                        ms_h_BChw, all_to_max_scale=all_to_max_scale, last_one=True
                    )
                )
            ).clamp_(-1, 1)
        else:
            return [
                self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)
                for f_hat in self.quantize.embed_to_fhat(
                    ms_h_BChw, all_to_max_scale=all_to_max_scale, last_one=False
                )
            ]

    def img_to_reconstructed_img(
        self,
        x,
        v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None,
        last_one=False,
    ) -> List[torch.Tensor]:
        f = self.quant_conv(self.encoder(x))
        ls_f_hat_BChw = self.quantize.f_to_idxBl_or_fhat(
            f, to_fhat=True, v_patch_nums=v_patch_nums
        )
        if last_one:
            return self.decoder(self.post_quant_conv(ls_f_hat_BChw[-1])).clamp_(-1, 1)
        else:
            return [
                self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)
                for f_hat in ls_f_hat_BChw
            ]

    def load_state_dict(self, state_dict: Dict[str, Any], strict=True, assign=False):
        if (
            "quantize.ema_vocab_hit_SV" in state_dict
            and state_dict["quantize.ema_vocab_hit_SV"].shape[0]
            != self.quantize.ema_vocab_hit_SV.shape[0]
        ):
            state_dict["quantize.ema_vocab_hit_SV"] = self.quantize.ema_vocab_hit_SV
        return super().load_state_dict(
            state_dict=state_dict, strict=strict, assign=assign
        )


class VQVAEHF(VQVAE, PyTorchModelHubMixin):
    def __init__(
        self,
        vocab_size=4096,
        z_channels=32,
        ch=160,
        test_mode=True,
        share_quant_resi=4,
        v_patch_nums=(1, 2, 3, 4, 6, 9, 13, 18, 24, 32),
    ):
        super().__init__(
            vocab_size=vocab_size,
            z_channels=z_channels,
            ch=ch,
            test_mode=test_mode,
            share_quant_resi=share_quant_resi,
            v_patch_nums=v_patch_nums,
        )
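VQVAEHF and SwittiHF both default to the 10-scale schedule (1, 2, 3, 4, 6, 9, 13, 18, 24, 32): the final scale is a 32x32 token map, 2240 tokens are predicted in total, and with the f=16 VAE above (ch_mult of length 5) that last grid corresponds to 512x512 pixels. A quick, self-contained sanity check of those numbers:

patch_nums = (1, 2, 3, 4, 6, 9, 13, 18, 24, 32)   # default v_patch_nums above
downsample = 2 ** (len((1, 1, 2, 2, 4)) - 1)       # VQVAE downsample factor: 16

total_tokens = sum(pn * pn for pn in patch_nums)
print(total_tokens)                 # 2240 tokens across all scales
print(patch_nums[-1] * downsample)  # 512 -> native image resolution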
requirements.txt
CHANGED
@@ -1,6 +1,16 @@
huggingface_hub==0.26.2
transformers==4.45.2
diffusers==0.31.0
einops==0.8.0
pytz==2024.2
wandb==0.18.7
torch==2.4.1
decord==0.6.0
numpy==2.1.2
Pillow==11.0.0
pytz==2024.2
scipy==1.14.1
torchvision==0.19.1
tqdm==4.66.5
gradio==5.7.1
spaces==0.30.4