PommesPeter committed on
Commit
5aadc08
1 Parent(s): 24e677f

Upload 8 files

Files changed (6)
  1. app.py +598 -0
  2. models/__init__.py +2 -0
  3. models/components.py +54 -0
  4. models/model.py +908 -0
  5. models/model_5b.py +894 -0
  6. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,598 @@
1
+ import argparse
2
+ import builtins
3
+ import json
4
+ import multiprocessing as mp
5
+ import os, sys
6
+ import random
7
+ import socket
8
+ import traceback
9
+
10
+ import fairscale.nn.model_parallel.initialize as fs_init
11
+ import gradio as gr
12
+ import numpy as np
13
+ import torch
14
+ import torch.distributed as dist
15
+ from torchvision.transforms.functional import to_pil_image
16
+
17
+ import models
18
+ from PIL import Image
19
+ from lumina_t2i.transport import create_transport, Sampler
20
+
21
+ description = """
22
+ # Lumina Next Text-to-Image
23
+
24
+ Lumina-Next-T2I is a 2B Next-DiT model with a 2B text encoder.
25
+
26
+ Demo current model: `Lumina-Next-T2I`
27
+
28
+ ### <span style='color: red;'>Due to the high volume of traffic, we have temporarily disabled the resolution extrapolation functionality.</span>
29
+
30
+ ### Additionally, we offer alternative links for Lumina-T2X access; if this site is busy, please try one of the other demos. [[demo1](http://106.14.2.150:10022/)] [[demo2](http://106.14.2.150:10023/)]
31
+
32
+ """
33
+
34
+ examples = [
35
+ ["👽🤖👹👻"],
36
+ ["孤舟蓑笠翁"],
37
+ ["两只黄鹂鸣翠柳"],
38
+ ["大漠孤烟直,长河落日圆"],
39
+ ["秋风起兮白云飞,草木黄落兮雁南归"],
40
+ ["도쿄 타워, 최고 품질의 우키요에, 에도 시대"],
41
+ ["味噌ラーメン, 最高品質の浮世絵、江戸時代。"],
42
+ ["東京タワー、最高品質の浮世絵、江戸時代。"],
43
+ ["Astronaut on Mars During sunset"],
44
+ ["Tour de Tokyo, estampes ukiyo-e de la plus haute qualité, période Edo"],
45
+ ["🐔 playing 🏀"],
46
+ ["☃️ with 🌹 in the ❄️"],
47
+ ["🐶 wearing 😎 flying on 🌈 "],
48
+ ["A small 🍎 and 🍊 with 😁 emoji in the Sahara desert"],
49
+ ["Токийская башня, лучшие укиё-э, период Эдо"],
50
+ ["Tokio-Turm, hochwertigste Ukiyo-e, Edo-Zeit"],
51
+ ["A scared cute rabbit in Happy Tree Friends style and punk vibe."], # noqa
52
+ ["A humanoid eagle soldier of the First World War."], # noqa
53
+ ["A cute Christmas mockup on an old wooden industrial desk table with Christmas decorations and bokeh lights in the background."],
54
+ ["A front view of a romantic flower shop in France filled with various blooming flowers including lavenders and roses."],
55
+ ["An old man, portrayed as a retro superhero, stands in the streets of New York City at night"],
56
+ ["many trees are surrounded by a lake in autumn colors, in the style of nature-inspired imagery, havencore, brightly colored, dark white and dark orange, bright primary colors, environmental activism, forestpunk --ar 64:51"],
57
+ ["A fluffy mouse holding a watermelon, in a magical and colorful setting, illustrated in the style of Hayao Miyazaki anime by Studio Ghibli."],
58
+ ["Inka warrior with a war make up, medium shot, natural light, Award winning wildlife photography, hyperrealistic, 8k resolution, --ar 9:16"],
59
+ ["Character of lion in style of saiyan, mafia, gangsta, citylights background, Hyper detailed, hyper realistic, unreal engine ue5, cgi 3d, cinematic shot, 8k"],
60
+ ["In the sky above, a giant, whimsical cloud shaped like the 😊 emoji casts a soft, golden light over the scene"],
61
+ ["Cyberpunk eagle, neon ambiance, abstract black oil, gear mecha, detailed acrylic, grunge, intricate complexity, rendered in unreal engine 5, photorealistic, 8k"],
62
+ ["close-up photo of a beautiful red rose breaking through a cube made of ice , splintered cracked ice surface, frosted colors, blood dripping from rose, melting ice, Valentine’s Day vibes, cinematic, sharp focus, intricate, cinematic, dramatic light"],
63
+ ["3D cartoon Fox Head with Human Body, Wearing Iridescent Holographic Liquid Texture & Translucent Material Sun Protective Shirt, Boss Feel, Nike or Addidas Sun Protective Shirt, WitchPunk, Y2K Style, Green and blue, Blue, Metallic Feel, Strong Reflection, plain background, no background, pure single color background, Digital Fashion, Surreal Futurism, Supreme Kong NFT Artwork Style, disney style, headshot photography for portrait studio shoot, fashion editorial aesthetic, high resolution in the style of HAPE PRIME NFT, NFT 3D IP Feel, Bored Ape Yacht Club NFT project Feel, high detail, fine luster, 3D render, oc render, best quality, 8K, bright, front lighting, Face Shot, fine luster, ultra detailed"],
64
+ ]
65
+
66
+ class ModelFailure:
67
+ pass
68
+
69
+
70
+ # Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt
71
+ def encode_prompt(
72
+ prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train=True
73
+ ):
74
+
75
+ captions = []
76
+ for caption in prompt_batch:
77
+ if random.random() < proportion_empty_prompts:
78
+ captions.append("")
79
+ elif isinstance(caption, str):
80
+ captions.append(caption)
81
+ elif isinstance(caption, (list, np.ndarray)):
82
+ # take a random caption if there are multiple
83
+ captions.append(random.choice(caption) if is_train else caption[0])
84
+
85
+ with torch.no_grad():
86
+ text_inputs = tokenizer(
87
+ captions,
88
+ padding=True,
89
+ pad_to_multiple_of=8,
90
+ max_length=256,
91
+ truncation=True,
92
+ return_tensors="pt",
93
+ )
94
+
95
+ text_input_ids = text_inputs.input_ids
96
+ prompt_masks = text_inputs.attention_mask
97
+
98
+ prompt_embeds = text_encoder(
99
+ input_ids=text_input_ids.cuda(),
100
+ attention_mask=prompt_masks.cuda(),
101
+ output_hidden_states=True,
102
+ ).hidden_states[-2]
103
+
104
+ return prompt_embeds, prompt_masks
105
+
106
+
107
+ @torch.no_grad()
108
+ def model_main(args, master_port, rank, request_queue, response_queue, mp_barrier):
109
+ # import here to avoid huggingface Tokenizer parallelism warnings
110
+ from diffusers.models import AutoencoderKL
111
+ from transformers import AutoModelForCausalLM, AutoTokenizer
112
+
113
+ # override the default print function since the delay can be large for child process
114
+ original_print = builtins.print
115
+
116
+ # Redefine the print function with flush=True by default
117
+ def print(*args, **kwargs):
118
+ kwargs.setdefault("flush", True)
119
+ original_print(*args, **kwargs)
120
+
121
+ # Override the built-in print with the new version
122
+ builtins.print = print
123
+
124
+ os.environ["MASTER_PORT"] = str(master_port)
125
+ os.environ["MASTER_ADDR"] = "127.0.0.1"
126
+ os.environ["RANK"] = str(rank)
127
+ os.environ["WORLD_SIZE"] = str(args.num_gpus)
128
+
129
+ dist.init_process_group("nccl")
130
+ # set up fairscale environment because some methods of the Lumina model need it,
131
+ # though for single-GPU inference fairscale actually has no effect
132
+ fs_init.initialize_model_parallel(args.num_gpus)
133
+ torch.cuda.set_device(rank)
134
+
135
+ train_args = torch.load(os.path.join(args.ckpt, "model_args.pth"))
136
+ if dist.get_rank() == 0:
137
+ print("Loaded model arguments:", json.dumps(train_args.__dict__, indent=2))
138
+
139
+ if dist.get_rank() == 0:
140
+ print(f"Creating lm: Gemma-2B")
141
+
142
+ dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[
143
+ args.precision
144
+ ]
145
+
146
+ text_encoder = (
147
+ AutoModelForCausalLM.from_pretrained(
148
+ "google/gemma-2b", torch_dtype=dtype, device_map="cuda"
149
+ )
150
+ .get_decoder()
151
+ .eval()
152
+ )
153
+ cap_feat_dim = text_encoder.config.hidden_size
154
+ if args.num_gpus > 1:
155
+ raise NotImplementedError("Inference with >1 GPUs not yet supported")
156
+
157
+ tokenizer = AutoTokenizer.from_pretrained(
158
+ "google/gemma-2b", add_bos_token=True, add_eos_token=True
159
+ )
160
+ tokenizer.padding_side = "right"
161
+
162
+ if dist.get_rank() == 0:
163
+ print(f"Creating vae: sdxl-vae")
164
+ vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae",
165
+ torch_dtype=torch.float32,
166
+ ).cuda()
167
+
168
+ if dist.get_rank() == 0:
169
+ print(f"Creating DiT: Next-DiT")
170
+ # latent_size = train_args.image_size // 8
171
+ model = models.__dict__["DiT_Llama_2B_patch2"](
172
+ qk_norm=train_args.qk_norm,
173
+ cap_feat_dim=cap_feat_dim,
174
+ )
175
+ model.eval().to("cuda", dtype=dtype)
176
+
177
+ assert train_args.model_parallel_size == args.num_gpus
178
+ if args.ema:
179
+ print("Loading ema model.")
180
+ ckpt = torch.load(
181
+ os.path.join(
182
+ args.ckpt,
183
+ f"consolidated{'_ema' if args.ema else ''}.{rank:02d}-of-{args.num_gpus:02d}.pth",
184
+ ),
185
+ map_location="cpu",
186
+ )
187
+ model.load_state_dict(ckpt, strict=True)
188
+
189
+ mp_barrier.wait()
190
+
191
+ with torch.autocast("cuda", dtype):
192
+ while True:
193
+ (
194
+ cap,
195
+ resolution,
196
+ num_sampling_steps,
197
+ cfg_scale,
198
+ solver,
199
+ t_shift,
200
+ seed,
201
+ ntk_scaling,
202
+ proportional_attn,
203
+ ) = request_queue.get()
204
+
205
+ print(
206
+ "> params:",
207
+ cap,
208
+ resolution,
209
+ num_sampling_steps,
210
+ cfg_scale,
211
+ solver,
212
+ t_shift,
213
+ seed,
214
+ ntk_scaling,
215
+ proportional_attn,
216
+ )
217
+ try:
218
+ # begin sampler
219
+ transport = create_transport(
220
+ args.path_type,
221
+ args.prediction,
222
+ args.loss_weight,
223
+ args.train_eps,
224
+ args.sample_eps,
225
+ )
226
+ sampler = Sampler(transport)
227
+ if args.sampler_mode == "ODE":
228
+ if args.likelihood:
229
+ # assert args.cfg_scale == 1, "Likelihood is incompatible with guidance" # todo
230
+ sample_fn = sampler.sample_ode_likelihood(
231
+ sampling_method=solver,
232
+ num_steps=num_sampling_steps,
233
+ atol=args.atol,
234
+ rtol=args.rtol,
235
+ )
236
+ else:
237
+ sample_fn = sampler.sample_ode(
238
+ sampling_method=solver,
239
+ num_steps=num_sampling_steps,
240
+ atol=args.atol,
241
+ rtol=args.rtol,
242
+ reverse=args.reverse,
243
+ time_shifting_factor=t_shift,
244
+ )
245
+ elif args.sampler_mode == "SDE":
246
+ sample_fn = sampler.sample_sde(
247
+ sampling_method=solver,
248
+ diffusion_form=args.diffusion_form,
249
+ diffusion_norm=args.diffusion_norm,
250
+ last_step=args.last_step,
251
+ last_step_size=args.last_step_size,
252
+ num_steps=num_sampling_steps,
253
+ )
254
+ # end sampler
255
+
256
+ resolution = resolution.split(" ")[-1]
257
+ w, h = resolution.split("x")
258
+ w, h = int(w), int(h)
259
+ latent_w, latent_h = w // 8, h // 8
260
+ if int(seed) != 0:
261
+ torch.random.manual_seed(int(seed))
262
+ z = torch.randn([1, 4, latent_h, latent_w], device="cuda").to(dtype)
263
+ z = z.repeat(2, 1, 1, 1)
264
+
265
+ with torch.no_grad():
266
+ cap_feats, cap_mask = encode_prompt(
267
+ [cap] + [""], text_encoder, tokenizer, 0.0
268
+ )
269
+ cap_mask = cap_mask.to(cap_feats.device)
270
+
271
+ train_res = 1024
272
+ res_cat = (w * h) ** 0.5
273
+ print(f"res_cat: {res_cat}")
274
+ max_seq_len = (res_cat // 16) ** 2 + (res_cat // 16) * 2
275
+ print(f"max_seq_len: {max_seq_len}")
276
+
277
+ rope_scaling_factor = 1.0
278
+ ntk_factor = max_seq_len / (train_res // 16) ** 2
279
+ print(f"ntk_factor: {ntk_factor}")
280
+
281
+ model_kwargs = dict(
282
+ cap_feats=cap_feats,
283
+ cap_mask=cap_mask,
284
+ cfg_scale=cfg_scale,
285
+ rope_scaling_factor=rope_scaling_factor,
286
+ ntk_factor=ntk_factor,
287
+ )
288
+
289
+ if dist.get_rank() == 0:
290
+ print(f"caption: {cap}")
291
+ print(f"num_sampling_steps: {num_sampling_steps}")
292
+ print(f"cfg_scale: {cfg_scale}")
293
+
294
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
295
+ print("> [debug] start sample")
296
+ samples = sample_fn(z, model.forward_with_cfg, **model_kwargs)[-1]
297
+ samples = samples[:1]
298
+
299
+ factor = 0.18215 if train_args.vae != "sdxl" else 0.13025
300
+ print(f"vae factor: {factor}")
301
+ samples = vae.decode(samples / factor).sample
302
+ samples = (samples + 1.0) / 2.0
303
+ samples.clamp_(0.0, 1.0)
304
+ img = to_pil_image(samples[0].float())
305
+
306
+ if response_queue is not None:
307
+ response_queue.put(img)
308
+
309
+ except Exception:
310
+ print(traceback.format_exc())
311
+ response_queue.put(ModelFailure())
312
+
313
+
314
+ def none_or_str(value):
315
+ if value == "None":
316
+ return None
317
+ return value
318
+
319
+
320
+ def parse_transport_args(parser):
321
+ group = parser.add_argument_group("Transport arguments")
322
+ group.add_argument(
323
+ "--path-type",
324
+ type=str,
325
+ default="Linear",
326
+ choices=["Linear", "GVP", "VP"],
327
+ help="the type of path for transport: 'Linear', 'GVP' (Geodesic Vector Pursuit), or 'VP' (Vector Pursuit).",
328
+ )
329
+ group.add_argument(
330
+ "--prediction",
331
+ type=str,
332
+ default="velocity",
333
+ choices=["velocity", "score", "noise"],
334
+ help="the prediction model for the transport dynamics.",
335
+ )
336
+ group.add_argument(
337
+ "--loss-weight",
338
+ type=none_or_str,
339
+ default=None,
340
+ choices=[None, "velocity", "likelihood"],
341
+ help="the weighting of different components in the loss function, can be 'velocity' for dynamic modeling, 'likelihood' for statistical consistency, or None for no weighting.",
342
+ )
343
+ group.add_argument(
344
+ "--sample-eps", type=float, help="sampling in the transport model."
345
+ )
346
+ group.add_argument(
347
+ "--train-eps", type=float, help="training to stabilize the learning process."
348
+ )
349
+
350
+
351
+ def parse_ode_args(parser):
352
+ group = parser.add_argument_group("ODE arguments")
353
+ group.add_argument(
354
+ "--atol",
355
+ type=float,
356
+ default=1e-6,
357
+ help="Absolute tolerance for the ODE solver.",
358
+ )
359
+ group.add_argument(
360
+ "--rtol",
361
+ type=float,
362
+ default=1e-3,
363
+ help="Relative tolerance for the ODE solver.",
364
+ )
365
+ group.add_argument(
366
+ "--reverse", action="store_true", help="run the ODE solver in reverse."
367
+ )
368
+ group.add_argument(
369
+ "--likelihood",
370
+ action="store_true",
371
+ help="Enable calculation of likelihood during the ODE solving process.",
372
+ )
373
+
374
+
375
+ def parse_sde_args(parser):
376
+ group = parser.add_argument_group("SDE arguments")
377
+ group.add_argument(
378
+ "--sampling-method",
379
+ type=str,
380
+ default="Euler",
381
+ choices=["Euler", "Heun"],
382
+ help="the numerical method used for sampling the stochastic differential equation: 'Euler' for simplicity or 'Heun' for improved accuracy.",
383
+ )
384
+ group.add_argument(
385
+ "--diffusion-form",
386
+ type=str,
387
+ default="sigma",
388
+ choices=[
389
+ "constant",
390
+ "SBDM",
391
+ "sigma",
392
+ "linear",
393
+ "decreasing",
394
+ "increasing-decreasing",
395
+ ],
396
+ help="form of diffusion coefficient in the SDE",
397
+ )
398
+ group.add_argument(
399
+ "--diffusion-norm",
400
+ type=float,
401
+ default=1.0,
402
+ help="Normalizes the diffusion coefficient, affecting the scale of the stochastic component.",
403
+ )
404
+ group.add_argument(
405
+ "--last-step",
406
+ type=none_or_str,
407
+ default="Mean",
408
+ choices=[None, "Mean", "Tweedie", "Euler"],
409
+ help="form of last step taken in the SDE",
410
+ )
411
+ group.add_argument(
412
+ "--last-step-size", type=float, default=0.04, help="size of the last step taken"
413
+ )
414
+
415
+
416
+ def find_free_port() -> int:
417
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
418
+ sock.bind(("", 0))
419
+ port = sock.getsockname()[1]
420
+ sock.close()
421
+ return port
422
+
423
+
424
+ def main():
425
+ parser = argparse.ArgumentParser()
426
+ mode = "ODE"
427
+
428
+ parser.add_argument("--num_gpus", type=int, default=1)
429
+ parser.add_argument("--ckpt", type=str, default="./checkpoints")
430
+ parser.add_argument("--ema", type=bool, default=True)
431
+ parser.add_argument("--precision", default="bf16", choices=["bf16", "fp32"])
432
+
433
+ parse_transport_args(parser)
434
+ if mode == "ODE":
435
+ parse_ode_args(parser)
436
+ # Further processing for ODE
437
+ elif mode == "SDE":
438
+ parse_sde_args(parser)
439
+ # Further processing for SDE
440
+
441
+ args = parser.parse_known_args()[0]
442
+
443
+ if args.num_gpus != 1:
444
+ raise NotImplementedError("Multi-GPU Inference is not yet supported")
445
+
446
+ args.sampler_mode = mode
447
+
448
+ master_port = find_free_port()
449
+
450
+ processes = []
451
+ request_queues = []
452
+ response_queue = mp.Queue()
453
+ mp_barrier = mp.Barrier(args.num_gpus + 1)
454
+ for i in range(args.num_gpus):
455
+ request_queues.append(mp.Queue())
456
+ p = mp.Process(
457
+ target=model_main,
458
+ args=(
459
+ args,
460
+ master_port,
461
+ i,
462
+ request_queues[i],
463
+ response_queue if i == 0 else None,
464
+ mp_barrier,
465
+ ),
466
+ )
467
+ p.start()
468
+ processes.append(p)
469
+
470
+ with gr.Blocks() as demo:
471
+ with gr.Row():
472
+ gr.Markdown(description)
473
+ with gr.Row():
474
+ with gr.Column():
475
+ cap = gr.Textbox(
476
+ lines=2,
477
+ label="Caption",
478
+ interactive=True,
479
+ value="Miss Mexico portrait of the most beautiful mexican woman, Exquisite detail, 30-megapixel, 4k, 85-mm-lens, sharp-focus, f:8, "
480
+ "ISO 100, shutter-speed 1:125, diffuse-back-lighting, award-winning photograph, small-catchlight, High-sharpness, facial-symmetry, 8k --q 2 --ar 18:32 --v 5",
481
+ )
482
+ with gr.Row():
483
+ res_choices = ["1024x1024", "512x2048", "2048x512"] + [
484
+ "(Extrapolation) 1664x1664",
485
+ "(Extrapolation) 1024x2048",
486
+ "(Extrapolation) 2048x1024",
487
+ ]
488
+ resolution = gr.Dropdown(
489
+ value=res_choices[0], choices=res_choices, label="Resolution"
490
+ )
491
+ with gr.Row():
492
+ num_sampling_steps = gr.Slider(
493
+ minimum=1,
494
+ maximum=70,
495
+ value=30,
496
+ interactive=True,
497
+ label="Sampling steps",
498
+ )
499
+ seed = gr.Slider(
500
+ minimum=0,
501
+ maximum=int(1e5),
502
+ value=1,
503
+ step=1,
504
+ interactive=True,
505
+ label="Seed (0 for random)",
506
+ )
507
+ with gr.Accordion(
508
+ "Advanced Settings for Resolution Extrapolation", open=False
509
+ ):
510
+ with gr.Row():
511
+ solver = gr.Dropdown(
512
+ value="euler",
513
+ choices=["euler", "dopri5", "dopri8"],
514
+ label="solver",
515
+ )
516
+ t_shift = gr.Slider(
517
+ minimum=1,
518
+ maximum=20,
519
+ value=6,
520
+ step=1,
521
+ interactive=True,
522
+ label="Time shift",
523
+ )
524
+ cfg_scale = gr.Slider(
525
+ minimum=1.0,
526
+ maximum=20.0,
527
+ value=4.0,
528
+ interactive=True,
529
+ label="CFG scale",
530
+ )
531
+ with gr.Row():
532
+ ntk_scaling = gr.Checkbox(
533
+ value=True,
534
+ interactive=True,
535
+ label="ntk scaling",
536
+ )
537
+ proportional_attn = gr.Checkbox(
538
+ value=True,
539
+ interactive=True,
540
+ label="Proportional attention",
541
+ )
542
+ with gr.Row():
543
+ submit_btn = gr.Button("Submit", variant="primary")
544
+ # reset_btn = gr.ClearButton([
545
+ # cap, resolution,
546
+ # num_sampling_steps, cfg_scale, solver,
547
+ # t_shift, seed,
548
+ # ntk_scaling, proportional_attn
549
+ # ])
550
+ with gr.Column():
551
+ default_img = Image.open("./image.png")
552
+ output_img = gr.Image(
553
+ label="Generated image",
554
+ interactive=False,
555
+ format="png",
556
+ value=default_img,
557
+ )
558
+
559
+ with gr.Row():
560
+ gr.Examples(
561
+ examples,
562
+ [cap],
563
+ label="Examples",
564
+ )
565
+
566
+ def on_submit(*args):
567
+ for q in request_queues:
568
+ q.put(args)
569
+ result = response_queue.get()
570
+ if isinstance(result, ModelFailure):
571
+ raise RuntimeError("model inference failed; see the server log for the traceback")
572
+ return result
573
+
574
+ submit_btn.click(
575
+ on_submit,
576
+ [
577
+ cap,
578
+ resolution,
579
+ num_sampling_steps,
580
+ cfg_scale,
581
+ solver,
582
+ t_shift,
583
+ seed,
584
+ ntk_scaling,
585
+ proportional_attn,
586
+ ],
587
+ [output_img],
588
+ )
589
+
590
+ mp_barrier.wait()
591
+ demo.queue().launch(share=True, server_name="0.0.0.0")
592
+
593
+
594
+ if __name__ == "__main__":
595
+ os.system("mkdir -p ./checkpoints")
596
+ os.system("huggingface-cli download --resume-download Alpha-VLLM/Lumina-Next-T2I --local-dir ./checkpoints --local-dir-use-symlinks False")
597
+ mp.set_start_method("spawn")
598
+ main()
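For context on the serving pattern in app.py above (one request queue per GPU worker, a single response queue owned by rank 0, and a barrier that gates the Gradio launch), here is a minimal, GPU-free sketch of the same handshake. The worker function and the payload string are illustrative stand-ins, not part of the repository.

import multiprocessing as mp

def toy_worker(rank, request_queue, response_queue, barrier):
    barrier.wait()                      # mirrors mp_barrier.wait() before serving
    prompt = request_queue.get()        # same blocking get() as model_main
    if response_queue is not None:      # only rank 0 owns the response queue
        response_queue.put(f"[rank {rank}] got: {prompt}")

if __name__ == "__main__":
    mp.set_start_method("spawn")
    num_workers = 2
    response_queue = mp.Queue()
    barrier = mp.Barrier(num_workers + 1)
    request_queues = [mp.Queue() for _ in range(num_workers)]
    procs = [
        mp.Process(target=toy_worker,
                   args=(i, request_queues[i], response_queue if i == 0 else None, barrier))
        for i in range(num_workers)
    ]
    for p in procs:
        p.start()
    barrier.wait()                      # wait until every worker is ready
    for q in request_queues:            # broadcast one request, like on_submit
        q.put("a cat in ukiyo-e style")
    print(response_queue.get())         # single reply from rank 0
    for p in procs:
        p.join()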
models/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # from .model import DiT_Llama_5B_patch2
2
+ from .model import DiT_Llama_2B_patch2
models/components.py ADDED
@@ -0,0 +1,54 @@
1
+ import warnings
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ try:
7
+ from apex.normalization import FusedRMSNorm as RMSNorm
8
+ except ImportError:
9
+ warnings.warn("Cannot import apex RMSNorm; falling back to the vanilla implementation")
10
+
11
+ class RMSNorm(torch.nn.Module):
12
+ def __init__(self, dim: int, eps: float = 1e-6):
13
+ """
14
+ Initialize the RMSNorm normalization layer.
15
+
16
+ Args:
17
+ dim (int): The dimension of the input tensor.
18
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
19
+
20
+ Attributes:
21
+ eps (float): A small value added to the denominator for numerical stability.
22
+ weight (nn.Parameter): Learnable scaling parameter.
23
+
24
+ """
25
+ super().__init__()
26
+ self.eps = eps
27
+ self.weight = nn.Parameter(torch.ones(dim))
28
+
29
+ def _norm(self, x):
30
+ """
31
+ Apply the RMSNorm normalization to the input tensor.
32
+
33
+ Args:
34
+ x (torch.Tensor): The input tensor.
35
+
36
+ Returns:
37
+ torch.Tensor: The normalized tensor.
38
+
39
+ """
40
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
41
+
42
+ def forward(self, x):
43
+ """
44
+ Forward pass through the RMSNorm layer.
45
+
46
+ Args:
47
+ x (torch.Tensor): The input tensor.
48
+
49
+ Returns:
50
+ torch.Tensor: The output tensor after applying RMSNorm.
51
+
52
+ """
53
+ output = self._norm(x.float()).type_as(x)
54
+ return output * self.weight
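A minimal usage sketch of the RMSNorm defined above (the apex FusedRMSNorm path behaves the same numerically, just fused); the tensor shapes are arbitrary examples, with 2304 matching the 2B model width.

import torch
from models.components import RMSNorm

norm = RMSNorm(2304, eps=1e-6)      # works with either the apex or the fallback class
x = torch.randn(2, 16, 2304)        # (batch, tokens, dim); sizes are illustrative
y = norm(x)
# each token vector is divided by its root-mean-square, then scaled by the
# learnable per-channel weight (initialized to ones), so initially RMS(y) ~= 1
print(y.shape, y.pow(2).mean(-1).sqrt().mean().item())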
models/model.py ADDED
@@ -0,0 +1,908 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # --------------------------------------------------------
7
+ # References:
8
+ # GLIDE: https://github.com/openai/glide-text2im
9
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
10
+ # --------------------------------------------------------
11
+
12
+ import functools
13
+ import logging
14
+ import math
15
+ from typing import Optional, Tuple, List
16
+
17
+ # from apex.normalization import FusedRMSNorm as RMSNorm
18
+ from .components import RMSNorm
19
+ import fairscale.nn.model_parallel.initialize as fs_init
20
+ from fairscale.nn.model_parallel.layers import (
21
+ ColumnParallelLinear, RowParallelLinear, ParallelEmbedding,
22
+ )
23
+ from flash_attn import flash_attn_varlen_func
24
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
25
+ import torch
26
+ import torch.distributed as dist
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ def modulate(x, shift, scale):
34
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
35
+
36
+
37
+ #############################################################################
38
+ # Embedding Layers for Timesteps and Class Labels #
39
+ #############################################################################
40
+
41
+ class ParallelTimestepEmbedder(nn.Module):
42
+ """
43
+ Embeds scalar timesteps into vector representations.
44
+ """
45
+ def __init__(self, hidden_size, frequency_embedding_size=256):
46
+ super().__init__()
47
+ self.mlp = nn.Sequential(
48
+ ColumnParallelLinear(
49
+ frequency_embedding_size, hidden_size, bias=True,
50
+ gather_output=False,
51
+ init_method=functools.partial(nn.init.normal_, std=0.02),
52
+ ),
53
+ nn.SiLU(),
54
+ RowParallelLinear(
55
+ hidden_size, hidden_size, bias=True, input_is_parallel=True,
56
+ init_method=functools.partial(nn.init.normal_, std=0.02),
57
+ ),
58
+ )
59
+ self.frequency_embedding_size = frequency_embedding_size
60
+
61
+ @staticmethod
62
+ def timestep_embedding(t, dim, max_period=10000):
63
+ """
64
+ Create sinusoidal timestep embeddings.
65
+ :param t: a 1-D Tensor of N indices, one per batch element.
66
+ These may be fractional.
67
+ :param dim: the dimension of the output.
68
+ :param max_period: controls the minimum frequency of the embeddings.
69
+ :return: an (N, D) Tensor of positional embeddings.
70
+ """
71
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
72
+ half = dim // 2
73
+ freqs = torch.exp(
74
+ -math.log(max_period) * torch.arange(
75
+ start=0, end=half, dtype=torch.float32
76
+ ) / half
77
+ ).to(device=t.device)
78
+ args = t[:, None].float() * freqs[None]
79
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
80
+ if dim % 2:
81
+ embedding = torch.cat([
82
+ embedding, torch.zeros_like(embedding[:, :1])
83
+ ], dim=-1)
84
+ return embedding
85
+
86
+ def forward(self, t):
87
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
88
+ t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype))
89
+ return t_emb
90
+
91
+
92
+ class ParallelLabelEmbedder(nn.Module):
93
+ r"""Embeds class labels into vector representations. Also handles label
94
+ dropout for classifier-free guidance.
95
+ """
96
+ def __init__(self, num_classes, hidden_size, dropout_prob):
97
+ super().__init__()
98
+ use_cfg_embedding = int(dropout_prob > 0)
99
+ self.embedding_table = ParallelEmbedding(
100
+ num_classes + use_cfg_embedding, hidden_size,
101
+ init_method=functools.partial(nn.init.normal_, std=0.02),
102
+ )
103
+ self.num_classes = num_classes
104
+ self.dropout_prob = dropout_prob
105
+
106
+ def token_drop(self, labels, force_drop_ids=None):
107
+ """
108
+ Drops labels to enable classifier-free guidance.
109
+ """
110
+ if force_drop_ids is None:
111
+ drop_ids = torch.rand(
112
+ labels.shape[0], device=labels.device
113
+ ) < self.dropout_prob
114
+ drop_ids = drop_ids.cuda()
115
+ dist.broadcast(
116
+ drop_ids,
117
+ fs_init.get_model_parallel_src_rank(),
118
+ fs_init.get_model_parallel_group(),
119
+ )
120
+ drop_ids = drop_ids.to(labels.device)
121
+ else:
122
+ drop_ids = force_drop_ids == 1
123
+ labels = torch.where(drop_ids, self.num_classes, labels)
124
+ return labels
125
+
126
+ def forward(self, labels, train, force_drop_ids=None):
127
+ use_dropout = self.dropout_prob > 0
128
+ if (train and use_dropout) or (force_drop_ids is not None):
129
+ labels = self.token_drop(labels, force_drop_ids)
130
+ embeddings = self.embedding_table(labels)
131
+ return embeddings
132
+
133
+
134
+ #############################################################################
135
+ # Core DiT Model #
136
+ #############################################################################
137
+
138
+
139
+ class Attention(nn.Module):
140
+ """Multi-head attention module."""
141
+ def __init__(self, dim: int, n_heads: int, n_kv_heads: Optional[int], qk_norm: bool, y_dim: int):
142
+ """
143
+ Initialize the Attention module.
144
+
145
+ Args:
146
+ dim (int): Number of input dimensions.
147
+ n_heads (int): Number of heads.
148
+ n_kv_heads (Optional[int]): Number of kv heads, if using GQA.
149
+
150
+ """
151
+ super().__init__()
152
+ self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
153
+ model_parallel_size = fs_init.get_model_parallel_world_size()
154
+ self.n_local_heads = n_heads // model_parallel_size
155
+ self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
156
+ self.n_rep = self.n_local_heads // self.n_local_kv_heads
157
+ self.head_dim = dim // n_heads
158
+
159
+ self.wq = ColumnParallelLinear(
160
+ dim, n_heads * self.head_dim, bias=False, gather_output=False,
161
+ init_method=nn.init.xavier_uniform_,
162
+ )
163
+ self.wk = ColumnParallelLinear(
164
+ dim, self.n_kv_heads * self.head_dim, bias=False,
165
+ gather_output=False, init_method=nn.init.xavier_uniform_,
166
+ )
167
+ self.wv = ColumnParallelLinear(
168
+ dim, self.n_kv_heads * self.head_dim, bias=False,
169
+ gather_output=False, init_method=nn.init.xavier_uniform_,
170
+ )
171
+ if y_dim > 0:
172
+ self.wk_y = ColumnParallelLinear(
173
+ y_dim, self.n_kv_heads * self.head_dim, bias=False,
174
+ gather_output=False, init_method=nn.init.xavier_uniform_,
175
+ )
176
+ self.wv_y = ColumnParallelLinear(
177
+ y_dim, self.n_kv_heads * self.head_dim, bias=False,
178
+ gather_output=False, init_method=nn.init.xavier_uniform_,
179
+ )
180
+ self.gate = nn.Parameter(torch.zeros([self.n_local_heads]))
181
+
182
+ self.wo = RowParallelLinear(
183
+ n_heads * self.head_dim, dim, bias=False,
184
+ input_is_parallel=True, init_method=nn.init.xavier_uniform_,
185
+ )
186
+
187
+ if qk_norm:
188
+ self.q_norm = nn.LayerNorm(self.n_local_heads * self.head_dim)
189
+ self.k_norm = nn.LayerNorm(self.n_local_kv_heads * self.head_dim)
190
+ if y_dim > 0:
191
+ self.ky_norm = nn.LayerNorm(self.n_local_kv_heads * self.head_dim)
192
+ else:
193
+ self.ky_norm = nn.Identity()
194
+ else:
195
+ self.q_norm = self.k_norm = nn.Identity()
196
+ self.ky_norm = nn.Identity()
197
+
198
+ # for proportional attention computation
199
+ self.base_seqlen = None
200
+ self.proportional_attn = False
201
+
202
+ @staticmethod
203
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
204
+ """
205
+ Reshape frequency tensor for broadcasting it with another tensor.
206
+
207
+ This function reshapes the frequency tensor to have the same shape as
208
+ the target tensor 'x' for the purpose of broadcasting the frequency
209
+ tensor during element-wise operations.
210
+
211
+ Args:
212
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
213
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
214
+
215
+ Returns:
216
+ torch.Tensor: Reshaped frequency tensor.
217
+
218
+ Raises:
219
+ AssertionError: If the frequency tensor doesn't match the expected
220
+ shape.
221
+ AssertionError: If the target tensor 'x' doesn't have the expected
222
+ number of dimensions.
223
+ """
224
+ ndim = x.ndim
225
+ assert 0 <= 1 < ndim
226
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
227
+ shape = [d if i == 1 or i == ndim - 1 else 1
228
+ for i, d in enumerate(x.shape)]
229
+ return freqs_cis.view(*shape)
230
+
231
+ @staticmethod
232
+ def apply_rotary_emb(
233
+ x_in: torch.Tensor,
234
+ freqs_cis: torch.Tensor,
235
+ ) -> torch.Tensor:
236
+ """
237
+ Apply rotary embeddings to input tensors using the given frequency
238
+ tensor.
239
+
240
+ This function applies rotary embeddings to the input tensor 'x_in' (a
241
+ query or key tensor) using the provided frequency tensor 'freqs_cis'. The
242
+ input tensors are reshaped as complex numbers, and the frequency tensor
243
+ is reshaped for broadcasting compatibility. The resulting tensors
244
+ contain rotary embeddings and are returned as real tensors.
245
+
246
+ Args:
247
+ x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings.
248
+ freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
249
+ exponentials.
250
+
251
+ Returns:
252
+ torch.Tensor: The modified input tensor with rotary embeddings
253
+ applied.
254
+ """
255
+ with torch.cuda.amp.autocast(enabled=False):
256
+ x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
257
+ freqs_cis = freqs_cis.unsqueeze(2)
258
+ x_out = torch.view_as_real(x * freqs_cis).flatten(3)
259
+ return x_out.type_as(x_in)
260
+
261
+ # copied from huggingface modeling_llama.py
262
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
263
+
264
+ def _get_unpad_data(attention_mask):
265
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
266
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
267
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
268
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
269
+ return (
270
+ indices,
271
+ cu_seqlens,
272
+ max_seqlen_in_batch,
273
+ )
274
+
275
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
276
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
277
+
278
+ key_layer = index_first_axis(
279
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
280
+ )
281
+ value_layer = index_first_axis(
282
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
283
+ )
284
+ if query_length == kv_seq_len:
285
+ query_layer = index_first_axis(
286
+ query_layer.reshape(batch_size * kv_seq_len, self.n_local_heads, head_dim), indices_k
287
+ )
288
+ cu_seqlens_q = cu_seqlens_k
289
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
290
+ indices_q = indices_k
291
+ elif query_length == 1:
292
+ max_seqlen_in_batch_q = 1
293
+ cu_seqlens_q = torch.arange(
294
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
295
+ ) # There is a memcpy here, that is very bad.
296
+ indices_q = cu_seqlens_q[:-1]
297
+ query_layer = query_layer.squeeze(1)
298
+ else:
299
+ # The -q_len: slice assumes left padding.
300
+ attention_mask = attention_mask[:, -query_length:]
301
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
302
+
303
+ return (
304
+ query_layer,
305
+ key_layer,
306
+ value_layer,
307
+ indices_q,
308
+ (cu_seqlens_q, cu_seqlens_k),
309
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
310
+ )
311
+
312
+ def forward(
313
+ self,
314
+ x: torch.Tensor,
315
+ x_mask: torch.Tensor,
316
+ freqs_cis: torch.Tensor,
317
+ y: torch.Tensor,
318
+ y_mask: torch.Tensor,
319
+ ) -> torch.Tensor:
320
+ """
321
+
322
+ Args:
323
+ x:
324
+ x_mask:
325
+ freqs_cis:
326
+ y:
327
+ y_mask:
328
+
329
+ Returns:
330
+
331
+ """
332
+ bsz, seqlen, _ = x.shape
333
+ xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
334
+ dtype = xq.dtype
335
+
336
+ xq = self.q_norm(xq)
337
+ xk = self.k_norm(xk)
338
+
339
+ xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
340
+ xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
341
+ xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
342
+
343
+ xq = Attention.apply_rotary_emb(xq, freqs_cis=freqs_cis)
344
+ xk = Attention.apply_rotary_emb(xk, freqs_cis=freqs_cis)
345
+
346
+ xq, xk = xq.to(dtype), xk.to(dtype)
347
+
348
+ if dtype in [torch.float16, torch.bfloat16]:
349
+ # begin var_len flash attn
350
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
351
+ xq, xk, xv, x_mask, seqlen
352
+ )
353
+
354
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
355
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
356
+
357
+ if self.proportional_attn:
358
+ softmax_scale = math.sqrt(math.log(seqlen, self.base_seqlen) / self.head_dim)
359
+ else:
360
+ softmax_scale = math.sqrt(1 / self.head_dim)
361
+
362
+ attn_output_unpad = flash_attn_varlen_func(
363
+ query_states,
364
+ key_states,
365
+ value_states,
366
+ cu_seqlens_q=cu_seqlens_q,
367
+ cu_seqlens_k=cu_seqlens_k,
368
+ max_seqlen_q=max_seqlen_in_batch_q,
369
+ max_seqlen_k=max_seqlen_in_batch_k,
370
+ dropout_p=0.,
371
+ causal=False,
372
+ softmax_scale=softmax_scale
373
+ )
374
+ output = pad_input(attn_output_unpad, indices_q, bsz, seqlen)
375
+ # end var_len_flash_attn
376
+
377
+ else:
378
+ output = F.scaled_dot_product_attention(
379
+ xq.permute(0, 2, 1, 3),
380
+ xk.permute(0, 2, 1, 3),
381
+ xv.permute(0, 2, 1, 3),
382
+ attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_local_heads, seqlen, -1),
383
+ ).permute(0, 2, 1, 3).to(dtype)
384
+
385
+ if hasattr(self, "wk_y"):
386
+ # todo better flash_attn support
387
+ yk = self.ky_norm(self.wk_y(y)).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
388
+ yv = self.wv_y(y).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
389
+ n_rep = self.n_local_heads // self.n_local_kv_heads
390
+ if n_rep >= 1:
391
+ yk = yk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
392
+ yv = yv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
393
+ output_y = F.scaled_dot_product_attention(
394
+ xq.permute(0, 2, 1, 3),
395
+ yk.permute(0, 2, 1, 3),
396
+ yv.permute(0, 2, 1, 3),
397
+ y_mask.view(bsz, 1, 1, -1).expand(bsz, self.n_local_heads, seqlen, -1)
398
+ ).permute(0, 2, 1, 3)
399
+ output_y = output_y * self.gate.tanh().view(1, 1, -1, 1)
400
+ output = output + output_y
401
+
402
+ output = output.flatten(-2)
403
+
404
+ return self.wo(output)
405
+
406
+
407
+ class FeedForward(nn.Module):
408
+ def __init__(
409
+ self,
410
+ dim: int,
411
+ hidden_dim: int,
412
+ multiple_of: int,
413
+ ffn_dim_multiplier: Optional[float],
414
+ ):
415
+ """
416
+ Initialize the FeedForward module.
417
+
418
+ Args:
419
+ dim (int): Input dimension.
420
+ hidden_dim (int): Hidden dimension of the feedforward layer.
421
+ multiple_of (int): Value to ensure hidden dimension is a multiple
422
+ of this value.
423
+ ffn_dim_multiplier (float, optional): Custom multiplier for hidden
424
+ dimension. Defaults to None.
425
+
426
+ Attributes:
427
+ w1 (ColumnParallelLinear): Linear transformation for the first
428
+ layer.
429
+ w2 (RowParallelLinear): Linear transformation for the second layer.
430
+ w3 (ColumnParallelLinear): Linear transformation for the third
431
+ layer.
432
+
433
+ """
434
+ super().__init__()
435
+ hidden_dim = int(2 * hidden_dim / 3)
436
+ # custom dim factor multiplier
437
+ if ffn_dim_multiplier is not None:
438
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
439
+ hidden_dim = multiple_of * (
440
+ (hidden_dim + multiple_of - 1) // multiple_of
441
+ )
442
+
443
+ self.w1 = ColumnParallelLinear(
444
+ dim, hidden_dim, bias=False, gather_output=False,
445
+ init_method=nn.init.xavier_uniform_,
446
+ )
447
+ self.w2 = RowParallelLinear(
448
+ hidden_dim, dim, bias=False, input_is_parallel=True,
449
+ init_method=nn.init.xavier_uniform_,
450
+ )
451
+ self.w3 = ColumnParallelLinear(
452
+ dim, hidden_dim, bias=False, gather_output=False,
453
+ init_method=nn.init.xavier_uniform_,
454
+ )
455
+
456
+ # @torch.compile
457
+ def _forward_silu_gating(self, x1, x3):
458
+ return F.silu(x1) * x3
459
+
460
+ def forward(self, x):
461
+ return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
462
+
463
+
464
+ class TransformerBlock(nn.Module):
465
+ def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int,
466
+ multiple_of: int, ffn_dim_multiplier: float, norm_eps: float,
467
+ qk_norm: bool, y_dim: int) -> None:
468
+ """
469
+ Initialize a TransformerBlock.
470
+
471
+ Args:
472
+ layer_id (int): Identifier for the layer.
473
+ dim (int): Embedding dimension of the input features.
474
+ n_heads (int): Number of attention heads.
475
+ n_kv_heads (Optional[int]): Number of attention heads in key and
476
+ value features (if using GQA), or set to None for the same as
477
+ query.
478
+ multiple_of (int):
479
+ ffn_dim_multiplier (float):
480
+ norm_eps (float):
481
+
482
+ Attributes:
483
+ n_heads (int): Number of attention heads.
484
+ dim (int): Dimension size of the model.
485
+ head_dim (int): Dimension size of each attention head.
486
+ attention (Attention): Attention module.
487
+ feed_forward (FeedForward): FeedForward module.
488
+ layer_id (int): Identifier for the layer.
489
+ attention_norm (RMSNorm): Layer normalization for attention output.
490
+ ffn_norm (RMSNorm): Layer normalization for feedforward output.
491
+
492
+ """
493
+ super().__init__()
494
+ self.dim = dim
495
+ self.head_dim = dim // n_heads
496
+ self.attention = Attention(dim, n_heads, n_kv_heads, qk_norm, y_dim)
497
+ self.feed_forward = FeedForward(
498
+ dim=dim, hidden_dim=4 * dim, multiple_of=multiple_of,
499
+ ffn_dim_multiplier=ffn_dim_multiplier,
500
+ )
501
+ self.layer_id = layer_id
502
+ self.attention_norm = RMSNorm(dim, eps=norm_eps)
503
+ self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
504
+ self.ffn_norm = RMSNorm(dim, eps=norm_eps)
505
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
506
+
507
+ self.adaLN_modulation = nn.Sequential(
508
+ nn.SiLU(),
509
+ ColumnParallelLinear(
510
+ min(dim, 1024), 6 * dim, bias=True, gather_output=True,
511
+ init_method=nn.init.zeros_,
512
+ ),
513
+ )
514
+
515
+ self.attention_y_norm = RMSNorm(y_dim, eps=norm_eps)
516
+
517
+ def forward(
518
+ self,
519
+ x: torch.Tensor,
520
+ x_mask: torch.Tensor,
521
+ freqs_cis: torch.Tensor,
522
+ y: torch.Tensor,
523
+ y_mask: torch.Tensor,
524
+ adaln_input: Optional[torch.Tensor] = None,
525
+ ):
526
+ """
527
+ Perform a forward pass through the TransformerBlock.
528
+
529
+ Args:
530
+ x (torch.Tensor): Input tensor.
531
+ freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
532
+
533
+ Returns:
534
+ torch.Tensor: Output tensor after applying attention and
535
+ feedforward layers.
536
+
537
+ """
538
+ if adaln_input is not None:
539
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = \
540
+ self.adaLN_modulation(adaln_input).chunk(6, dim=1)
541
+
542
+ x = x + self.attention_norm1(gate_msa.unsqueeze(1) * self.attention(
543
+ modulate(self.attention_norm(x), shift_msa, scale_msa),
544
+ x_mask,
545
+ freqs_cis,
546
+ self.attention_y_norm(y),
547
+ y_mask,
548
+ ))
549
+ d = x.shape[-1]
550
+ x = x + self.ffn_norm1(gate_mlp.unsqueeze(1) * self.feed_forward(
551
+ modulate(self.ffn_norm(x), shift_mlp, scale_mlp).view(-1, d),
552
+ ).view(*x.shape))
553
+
554
+ else:
555
+ x = x + self.attention_norm1(self.attention(
556
+ self.attention_norm(x), x_mask, freqs_cis, self.attention_y_norm(y), y_mask
557
+ ))
558
+ # for compatibility with torch.compile because the sequence length changes
559
+ B, L, D = x.shape
560
+ x = x.view(B*L, D)
561
+ x = x + self.ffn_norm1(self.feed_forward(self.ffn_norm(x)))
562
+ x = x.view(B, L, D)
563
+
564
+ return x
565
+
566
+
567
+ class ParallelFinalLayer(nn.Module):
568
+ """
569
+ The final layer of DiT.
570
+ """
571
+ def __init__(self, hidden_size, patch_size, out_channels):
572
+ super().__init__()
573
+ self.norm_final = nn.LayerNorm(
574
+ hidden_size, elementwise_affine=False, eps=1e-6,
575
+ )
576
+ self.linear = ColumnParallelLinear(
577
+ hidden_size, patch_size * patch_size * out_channels, bias=True,
578
+ init_method=nn.init.zeros_, gather_output=True,
579
+ )
580
+ self.adaLN_modulation = nn.Sequential(
581
+ nn.SiLU(),
582
+ ColumnParallelLinear(
583
+ min(hidden_size, 1024), 2 * hidden_size, bias=True,
584
+ init_method=nn.init.zeros_, gather_output=True,
585
+ ),
586
+ )
587
+
588
+ def forward(self, x, c):
589
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
590
+ x = modulate(self.norm_final(x), shift, scale)
591
+ x = self.linear(x)
592
+ return x
593
+
594
+
595
+ class DiT_Llama(nn.Module):
596
+ """
597
+ Diffusion model with a Transformer backbone.
598
+ """
599
+ def __init__(
600
+ self,
601
+ patch_size: int = 2,
602
+ in_channels: int = 4,
603
+ dim: int = 4096,
604
+ n_layers: int = 32,
605
+ n_heads: int = 32,
606
+ n_kv_heads: Optional[int] = None,
607
+ multiple_of: int = 256,
608
+ ffn_dim_multiplier: Optional[float] = None,
609
+ norm_eps: float = 1e-5,
610
+ learn_sigma: bool = True,
611
+ qk_norm: bool = False,
612
+ cap_feat_dim: int = 5120,
613
+ rope_scaling_factor: float = 1.,
614
+ ntk_factor: float=1.
615
+ ) -> None:
616
+ super().__init__()
617
+ self.learn_sigma = learn_sigma
618
+ self.in_channels = in_channels
619
+ self.out_channels = in_channels * 2 if learn_sigma else in_channels
620
+ self.patch_size = patch_size
621
+
622
+ self.x_embedder = ColumnParallelLinear(
623
+ in_features=patch_size * patch_size * in_channels,
624
+ out_features=dim,
625
+ bias=True,
626
+ gather_output=True,
627
+ init_method=nn.init.xavier_uniform_,
628
+ )
629
+ nn.init.constant_(self.x_embedder.bias, 0.)
630
+
631
+ self.t_embedder = ParallelTimestepEmbedder(min(dim, 1024))
632
+ self.cap_embedder = nn.Sequential(
633
+ nn.LayerNorm(cap_feat_dim),
634
+ ColumnParallelLinear(cap_feat_dim, min(dim, 1024), bias=True, gather_output=True,
635
+ init_method=nn.init.zeros_),
636
+ )
637
+
638
+ self.layers = nn.ModuleList([
639
+ TransformerBlock(layer_id, dim, n_heads, n_kv_heads, multiple_of,
640
+ ffn_dim_multiplier, norm_eps, qk_norm, cap_feat_dim)
641
+ for layer_id in range(n_layers)
642
+ ])
643
+ self.final_layer = ParallelFinalLayer(dim, patch_size, self.out_channels)
644
+
645
+ assert (dim // n_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"
646
+ self.dim = dim
647
+ self.n_heads = n_heads
648
+ self.freqs_cis = DiT_Llama.precompute_freqs_cis(
649
+ dim // n_heads, 384, rope_scaling_factor=rope_scaling_factor, ntk_factor=ntk_factor
650
+ )
651
+ self.rope_scaling_factor = rope_scaling_factor
652
+ self.ntk_factor = ntk_factor
653
+ # self.eol_token = nn.Parameter(torch.empty(dim))
654
+ self.pad_token = nn.Parameter(torch.empty(dim))
655
+ # nn.init.normal_(self.eol_token, std=0.02)
656
+ nn.init.normal_(self.pad_token, std=0.02)
657
+
658
+ def unpatchify(self, x: torch.Tensor, img_size: List[Tuple[int, int]], return_tensor=False) -> List[torch.Tensor]:
659
+ """
660
+ x: (N, T, patch_size**2 * C)
661
+ imgs: (N, H, W, C)
662
+ """
663
+ pH = pW = self.patch_size
664
+ if return_tensor:
665
+ H, W = img_size[0]
666
+ B = x.size(0)
667
+ L = (H // pH) * (W // pW)
668
+ x = x[:, :L].view(B, H // pH, W // pW, pH, pW, self.out_channels)
669
+ x = x.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)
670
+ return x
671
+ else:
672
+ imgs = []
673
+ for i in range(x.size(0)):
674
+ H, W = img_size[i]
675
+ L = (H // pH) * (W // pW)
676
+ imgs.append(x[i][:L].view(
677
+ H // pH, W // pW, pH, pW, self.out_channels
678
+ ).permute(4, 0, 2, 1, 3).flatten(3, 4).flatten(1, 2))
679
+ return imgs
680
+
681
+ def patchify_and_embed(
682
+ self,
683
+ x: List[torch.Tensor] | torch.Tensor
684
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], torch.Tensor]:
685
+ self.freqs_cis = self.freqs_cis.to(x[0].device)
686
+ if isinstance(x, torch.Tensor):
687
+ pH = pW = self.patch_size
688
+ B, C, H, W = x.size()
689
+ x = x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 1, 3, 5).flatten(3)
690
+ x = self.x_embedder(x)
691
+ x = x.flatten(1, 2)
692
+
693
+ mask = torch.ones(x.shape[0], x.shape[1], dtype=torch.int32, device=x.device)
694
+ # leave the first line for text
695
+ return x, mask, [(H, W)] * B, self.freqs_cis[:H//pH, :W//pW].flatten(0,1).unsqueeze(0)
696
+ else:
697
+ pH = pW = self.patch_size
698
+ x_embed = []
699
+ freqs_cis = []
700
+ img_size = []
701
+ l_effective_seq_len = []
702
+
703
+ for img in x:
704
+ C, H, W = img.size()
705
+ item_freqs_cis = self.freqs_cis[:H//pH, :W//pW]
706
+ freqs_cis.append(item_freqs_cis.flatten(0,1))
707
+ img_size.append((H, W))
708
+ img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 0, 2, 4).flatten(2)
709
+ img = self.x_embedder(img)
710
+ img = img.flatten(0, 1)
711
+ l_effective_seq_len.append(len(img))
712
+ x_embed.append(img)
713
+
714
+ max_seq_len = max(l_effective_seq_len)
715
+ mask = torch.zeros(len(x), max_seq_len, dtype=torch.int32, device=x[0].device)
716
+ padded_x_embed = []
717
+ padded_freqs_cis = []
718
+ for i, (item_embed, item_freqs_cis, item_seq_len) in enumerate(zip(
719
+ x_embed, freqs_cis, l_effective_seq_len
720
+ )):
721
+ item_embed = torch.cat([
722
+ item_embed,
723
+ self.pad_token.view(1, -1).expand(max_seq_len - item_seq_len, -1),
724
+ ], dim=0)
725
+ item_freqs_cis = torch.cat([
726
+ item_freqs_cis,
727
+ item_freqs_cis[-1:].expand(max_seq_len - item_seq_len, -1)
728
+ ], dim=0)
729
+ padded_x_embed.append(item_embed)
730
+ padded_freqs_cis.append(item_freqs_cis)
731
+ mask[i][:item_seq_len] = 1
732
+
733
+ x_embed = torch.stack(padded_x_embed, dim=0)
734
+ freqs_cis = torch.stack(padded_freqs_cis, dim=0)
735
+ return x_embed, mask, img_size, freqs_cis
736
+
737
+ def forward(self, x, t, cap_feats, cap_mask):
738
+ """
739
+ Forward pass of DiT.
740
+ t: (N,) tensor of diffusion timesteps
741
+ cap_feats, cap_mask: caption features and the corresponding attention mask
742
+ """
743
+ x_is_tensor = isinstance(x, torch.Tensor)
744
+ x, mask, img_size, freqs_cis = self.patchify_and_embed(x)
745
+ freqs_cis = freqs_cis.to(x.device)
746
+
747
+ # cap_freqs_cis = self.freqs_cis[:1, :cap_feats.shape[1]].to(x.device)
748
+
749
+ t = self.t_embedder(t) # (N, D)
750
+ cap_mask_float = cap_mask.float().unsqueeze(-1)
751
+ cap_feats_pool = (cap_feats * cap_mask_float).sum(dim=1) / cap_mask_float.sum(dim=1)
752
+ cap_feats_pool = cap_feats_pool.to(cap_feats)
753
+ cap_emb = self.cap_embedder(cap_feats_pool)
754
+ adaln_input = t + cap_emb
755
+
756
+ cap_mask = cap_mask.bool()
757
+ for layer in self.layers:
758
+ x = layer(
759
+ x, mask, freqs_cis, cap_feats, cap_mask,
760
+ adaln_input=adaln_input
761
+ )
762
+
763
+ x = self.final_layer(x, adaln_input)
764
+ x = self.unpatchify(x, img_size, return_tensor=x_is_tensor)
765
+ if self.learn_sigma:
766
+ if x_is_tensor:
767
+ x, _ = x.chunk(2, dim=1)
768
+ else:
769
+ x = [_.chunk(2, dim=0)[0] for _ in x]
770
+ return x
771
+
772
+ def forward_with_cfg(self, x, t, cap_feats, cap_mask, cfg_scale, rope_scaling_factor=None, ntk_factor=None, base_seqlen: Optional[int] = None, proportional_attn: bool = False):
773
+ # """
774
+ # Forward pass of DiT, but also batches the unconditional forward pass
775
+ # for classifier-free guidance.
776
+ # """
777
+ # # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
778
+ # print(ntk_factor, rope_scaling_factor, self.ntk_factor, self.rope_scaling_factor)
779
+ if rope_scaling_factor is not None or ntk_factor is not None:
780
+ rope_scaling_factor = rope_scaling_factor if rope_scaling_factor is not None else self.rope_scaling_factor
781
+ ntk_factor = ntk_factor if ntk_factor is not None else self.ntk_factor
782
+ if rope_scaling_factor != self.rope_scaling_factor or ntk_factor != self.ntk_factor:
783
+ print(f"override freqs_cis, rope_scaling {rope_scaling_factor}, ntk {ntk_factor}", flush=True)
784
+ self.freqs_cis = DiT_Llama.precompute_freqs_cis(
785
+ self.dim // self.n_heads, 384,
786
+ rope_scaling_factor=rope_scaling_factor, ntk_factor=ntk_factor
787
+ )
788
+ self.rope_scaling_factor = rope_scaling_factor
789
+ self.ntk_factor = ntk_factor
790
+
791
+ if proportional_attn:
792
+ assert base_seqlen is not None
793
+ for layer in self.layers:
794
+ layer.attention.base_seqlen = base_seqlen
795
+ layer.attention.proportional_attn = proportional_attn
796
+ else:
797
+ for layer in self.layers:
798
+ layer.attention.base_seqlen = None
799
+ layer.attention.proportional_attn = proportional_attn
800
+
801
+ half = x[: len(x) // 2]
802
+ combined = torch.cat([half, half], dim=0)
803
+ model_out = self.forward(combined, t, cap_feats, cap_mask)
804
+ # For exact reproducibility reasons, we apply classifier-free guidance on only
805
+ # three channels by default. The standard approach to cfg applies it to all channels.
806
+ # This can be done by uncommenting the following line and commenting-out the line following that.
807
+ # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
808
+ eps, rest = model_out[:, :3], model_out[:, 3:]
809
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
810
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
811
+ eps = torch.cat([half_eps, half_eps], dim=0)
812
+ return torch.cat([eps, rest], dim=1)
813
+
814
+ @staticmethod
815
+ def precompute_freqs_cis(
816
+ dim: int,
817
+ end: int,
818
+ theta: float = 10000.0,
819
+ rope_scaling_factor: float = 1.0,
820
+ ntk_factor: float = 1.0
821
+ ):
822
+ """
823
+ Precompute the frequency tensor for complex exponentials (cis) with
824
+ given dimensions.
825
+
826
+ This function calculates a frequency tensor with complex exponentials
827
+ using the given dimension 'dim' and the end index 'end'. The 'theta'
828
+ parameter scales the frequencies. The returned tensor contains complex
829
+ values in complex64 data type.
830
+
831
+ Args:
832
+ dim (int): Dimension of the frequency tensor.
833
+ end (int): End index for precomputing frequencies.
834
+ theta (float, optional): Scaling factor for frequency computation.
835
+ Defaults to 10000.0.
836
+
837
+ Returns:
838
+ torch.Tensor: Precomputed frequency tensor with complex
839
+ exponentials.
840
+ """
841
+
842
+ theta = theta * ntk_factor
843
+
844
+ logger.info(f"theta {theta} rope scaling {rope_scaling_factor} ntk {ntk_factor}")
845
+ freqs = 1.0 / (theta ** (
846
+ torch.arange(0, dim, 4)[: (dim // 4)].float().cuda() / dim
847
+ ))
848
+ t = torch.arange(end, device=freqs.device, dtype=torch.float) # type: ignore
849
+ t = t / rope_scaling_factor
850
+ freqs = torch.outer(t, freqs).float() # type: ignore
851
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
852
+
853
+ freqs_cis_h = freqs_cis.view(end, 1, dim//4, 1).repeat(1, end, 1, 1)
854
+ freqs_cis_w = freqs_cis.view(1, end, dim//4, 1).repeat(end, 1, 1, 1)
855
+ freqs_cis = torch.cat([freqs_cis_h, freqs_cis_w], dim=-1).flatten(2)
856
+ return freqs_cis
857
+
858
+ def parameter_count(self) -> int:
859
+ tensor_parallel_module_list = (
860
+ ColumnParallelLinear, RowParallelLinear, ParallelEmbedding,
861
+ )
862
+ total_params = 0
863
+
864
+ def _recursive_count_params(module):
865
+ nonlocal total_params
866
+ is_tp_module = isinstance(module, tensor_parallel_module_list)
867
+ for param in module.parameters(recurse=False):
868
+ total_params += param.numel() * (
869
+ fs_init.get_model_parallel_world_size()
870
+ if is_tp_module else 1
871
+ )
872
+ for submodule in module.children():
873
+ _recursive_count_params(submodule)
874
+
875
+ _recursive_count_params(self)
876
+ return total_params
877
+
878
+ def get_fsdp_wrap_module_list(self) -> List[nn.Module]:
879
+ return list(self.layers)
880
+
881
+
882
+ #############################################################################
883
+ # DiT Configs #
884
+ #############################################################################
885
+
886
+
887
+ def DiT_Llama_600M_patch2(**kwargs):
888
+ return DiT_Llama(
889
+ patch_size=2, dim=1536, n_layers=16, n_heads=32, **kwargs
890
+ )
891
+
892
+
893
+ def DiT_Llama_2B_patch2(**kwargs):
894
+ return DiT_Llama(
895
+ patch_size=2, dim=2304, n_layers=24, n_heads=32, **kwargs
896
+ )
897
+
898
+
899
+ def DiT_Llama_3B_patch2(**kwargs):
900
+ return DiT_Llama(
901
+ patch_size=2, dim=3072, n_layers=32, n_heads=32, **kwargs
902
+ )
903
+
904
+
905
+ def DiT_Llama_7B_patch2(**kwargs):
906
+ return DiT_Llama(
907
+ patch_size=2, dim=4096, n_layers=32, n_heads=32, **kwargs
908
+ )
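
Editor's note (illustration only, not part of the uploaded files): the `precompute_freqs_cis` above builds an axial 2-D RoPE table, in which half of the complex channels rotate with the row index and the other half with the column index. Below is a minimal CPU sketch of that same computation, with the `.cuda()` call dropped so it runs anywhere; the 2B config numbers (dim=2304, 32 heads, so head_dim=72) are used only for the shape check.

```python
import torch

def axial_rope_freqs(dim: int, end: int, theta: float = 10000.0,
                     rope_scaling_factor: float = 1.0, ntk_factor: float = 1.0) -> torch.Tensor:
    """CPU re-implementation of the 2-D (axial) RoPE table built above.

    `dim` is the per-head dimension and `end` the maximum grid side length.
    Half of the complex channels rotate with the row index, the other half
    with the column index.
    """
    theta = theta * ntk_factor                                # NTK-aware base rescaling
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: dim // 4].float() / dim))
    t = torch.arange(end, dtype=torch.float) / rope_scaling_factor
    freqs = torch.outer(t, freqs)                             # (end, dim // 4)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)    # complex64
    freqs_cis_h = freqs_cis.view(end, 1, dim // 4, 1).repeat(1, end, 1, 1)
    freqs_cis_w = freqs_cis.view(1, end, dim // 4, 1).repeat(end, 1, 1, 1)
    return torch.cat([freqs_cis_h, freqs_cis_w], dim=-1).flatten(2)

table = axial_rope_freqs(dim=72, end=16)   # 2304 / 32 heads = 72 per head
print(table.shape, table.dtype)            # torch.Size([16, 16, 36]) torch.complex64
```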
models/model_5b.py ADDED
@@ -0,0 +1,894 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # --------------------------------------------------------
7
+ # References:
8
+ # GLIDE: https://github.com/openai/glide-text2im
9
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
10
+ # --------------------------------------------------------
11
+
12
+ import functools
13
+ import math
14
+ from typing import Optional, Tuple, List
15
+
16
+ from .components import RMSNorm
17
+ import fairscale.nn.model_parallel.initialize as fs_init
18
+ from fairscale.nn.model_parallel.layers import (
19
+ ColumnParallelLinear, RowParallelLinear, ParallelEmbedding,
20
+ )
21
+ from flash_attn import flash_attn_varlen_func
22
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
23
+ import torch
24
+ import torch.distributed as dist
25
+ import torch.nn as nn
26
+ import torch.nn.functional as F
27
+
28
+
29
+ def modulate(x, shift, scale):
30
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
31
+
32
+
33
+ #############################################################################
34
+ # Embedding Layers for Timesteps and Class Labels #
35
+ #############################################################################
36
+
37
+ class ParallelTimestepEmbedder(nn.Module):
38
+ """
39
+ Embeds scalar timesteps into vector representations.
40
+ """
41
+ def __init__(self, hidden_size, frequency_embedding_size=256):
42
+ super().__init__()
43
+ self.mlp = nn.Sequential(
44
+ ColumnParallelLinear(
45
+ frequency_embedding_size, hidden_size, bias=True,
46
+ gather_output=False,
47
+ init_method=functools.partial(nn.init.normal_, std=0.02),
48
+ ),
49
+ nn.SiLU(),
50
+ RowParallelLinear(
51
+ hidden_size, hidden_size, bias=True, input_is_parallel=True,
52
+ init_method=functools.partial(nn.init.normal_, std=0.02),
53
+ ),
54
+ )
55
+ self.frequency_embedding_size = frequency_embedding_size
56
+
57
+ @staticmethod
58
+ def timestep_embedding(t, dim, max_period=10000):
59
+ """
60
+ Create sinusoidal timestep embeddings.
61
+ :param t: a 1-D Tensor of N indices, one per batch element.
62
+ These may be fractional.
63
+ :param dim: the dimension of the output.
64
+ :param max_period: controls the minimum frequency of the embeddings.
65
+ :return: an (N, D) Tensor of positional embeddings.
66
+ """
67
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
68
+ half = dim // 2
69
+ freqs = torch.exp(
70
+ -math.log(max_period) * torch.arange(
71
+ start=0, end=half, dtype=torch.float32
72
+ ) / half
73
+ ).to(device=t.device)
74
+ args = t[:, None].float() * freqs[None]
75
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
76
+ if dim % 2:
77
+ embedding = torch.cat([
78
+ embedding, torch.zeros_like(embedding[:, :1])
79
+ ], dim=-1)
80
+ return embedding
81
+
82
+ def forward(self, t):
83
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
84
+ t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype))
85
+ return t_emb
86
+
87
+
88
+ class ParallelLabelEmbedder(nn.Module):
89
+ r"""Embeds class labels into vector representations. Also handles label
90
+ dropout for classifier-free guidance.
91
+ """
92
+ def __init__(self, num_classes, hidden_size, dropout_prob):
93
+ super().__init__()
94
+ use_cfg_embedding = int(dropout_prob > 0)
95
+ self.embedding_table = ParallelEmbedding(
96
+ num_classes + use_cfg_embedding, hidden_size,
97
+ init_method=functools.partial(nn.init.normal_, std=0.02),
98
+ )
99
+ self.num_classes = num_classes
100
+ self.dropout_prob = dropout_prob
101
+
102
+ def token_drop(self, labels, force_drop_ids=None):
103
+ """
104
+ Drops labels to enable classifier-free guidance.
105
+ """
106
+ if force_drop_ids is None:
107
+ drop_ids = torch.rand(
108
+ labels.shape[0], device=labels.device
109
+ ) < self.dropout_prob
110
+ drop_ids = drop_ids.cuda()
111
+ dist.broadcast(
112
+ drop_ids,
113
+ fs_init.get_model_parallel_src_rank(),
114
+ fs_init.get_model_parallel_group(),
115
+ )
116
+ drop_ids = drop_ids.to(labels.device)
117
+ else:
118
+ drop_ids = force_drop_ids == 1
119
+ labels = torch.where(drop_ids, self.num_classes, labels)
120
+ return labels
121
+
122
+ def forward(self, labels, train, force_drop_ids=None):
123
+ use_dropout = self.dropout_prob > 0
124
+ if (train and use_dropout) or (force_drop_ids is not None):
125
+ labels = self.token_drop(labels, force_drop_ids)
126
+ embeddings = self.embedding_table(labels)
127
+ return embeddings
128
+
129
+
130
+ #############################################################################
131
+ # Core DiT Model #
132
+ #############################################################################
133
+
134
+
135
+ class Attention(nn.Module):
136
+ """Multi-head attention module."""
137
+ def __init__(self, dim: int, n_heads: int, n_kv_heads: Optional[int], qk_norm: bool, y_dim: int):
138
+ """
139
+ Initialize the Attention module.
140
+
141
+ Args:
142
+ dim (int): Number of input dimensions.
143
+ n_heads (int): Number of heads.
144
+ n_kv_heads (Optional[int]): Number of kv heads, if using GQA.
145
+
146
+ Attributes:
147
+ n_kv_heads (int): Number of key and value heads.
148
+ n_local_heads (int): Number of local query heads.
149
+ n_local_kv_heads (int): Number of local key and value heads.
150
+ n_rep (int): Number of repetitions for local heads.
151
+ head_dim (int): Dimension size of each attention head.
152
+ wq (ColumnParallelLinear): Linear transformation for queries.
153
+ wk (ColumnParallelLinear): Linear transformation for keys.
154
+ wv (ColumnParallelLinear): Linear transformation for values.
155
+ wo (RowParallelLinear): Linear transformation for output.
156
+ cache_k (torch.Tensor): Cached keys for attention.
157
+ cache_v (torch.Tensor): Cached values for attention.
158
+
159
+ """
160
+ super().__init__()
161
+ self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
162
+ model_parallel_size = fs_init.get_model_parallel_world_size()
163
+ self.n_local_heads = n_heads // model_parallel_size
164
+ self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
165
+ self.n_rep = self.n_local_heads // self.n_local_kv_heads
166
+ self.head_dim = dim // n_heads
167
+
168
+ self.wq = ColumnParallelLinear(
169
+ dim, n_heads * self.head_dim, bias=False, gather_output=False,
170
+ init_method=nn.init.xavier_uniform_,
171
+ )
172
+ self.wk = ColumnParallelLinear(
173
+ dim, self.n_kv_heads * self.head_dim, bias=False,
174
+ gather_output=False, init_method=nn.init.xavier_uniform_,
175
+ )
176
+ self.wv = ColumnParallelLinear(
177
+ dim, self.n_kv_heads * self.head_dim, bias=False,
178
+ gather_output=False, init_method=nn.init.xavier_uniform_,
179
+ )
180
+ if y_dim > 0:
181
+ self.wk_y = ColumnParallelLinear(
182
+ y_dim, self.n_kv_heads * self.head_dim, bias=False,
183
+ gather_output=False, init_method=nn.init.xavier_uniform_,
184
+ )
185
+ self.wv_y = ColumnParallelLinear(
186
+ y_dim, self.n_kv_heads * self.head_dim, bias=False,
187
+ gather_output=False, init_method=nn.init.xavier_uniform_,
188
+ )
189
+ self.gate = nn.Parameter(torch.zeros([self.n_local_heads]))
190
+
191
+ self.wo = RowParallelLinear(
192
+ n_heads * self.head_dim, dim, bias=False,
193
+ input_is_parallel=True, init_method=nn.init.xavier_uniform_,
194
+ )
195
+
196
+ if qk_norm:
197
+ self.q_norm = nn.LayerNorm(self.n_local_heads * self.head_dim)
198
+ self.k_norm = nn.LayerNorm(self.n_local_kv_heads * self.head_dim)
199
+ if y_dim > 0:
200
+ self.ky_norm = nn.LayerNorm(self.n_local_kv_heads * self.head_dim)
201
+ else:
202
+ self.ky_norm = nn.Identity()
203
+ else:
204
+ self.q_norm = self.k_norm = nn.Identity()
205
+ self.ky_norm = nn.Identity()
206
+
207
+ # for proportional attention computation
208
+ self.base_seqlen = None
209
+ self.proportional_attn = False
210
+
211
+ @staticmethod
212
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
213
+ """
214
+ Reshape frequency tensor for broadcasting it with another tensor.
215
+
216
+ This function reshapes the frequency tensor to have the same shape as
217
+ the target tensor 'x' for the purpose of broadcasting the frequency
218
+ tensor during element-wise operations.
219
+
220
+ Args:
221
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
222
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
223
+
224
+ Returns:
225
+ torch.Tensor: Reshaped frequency tensor.
226
+
227
+ Raises:
228
+ AssertionError: If the frequency tensor doesn't match the expected
229
+ shape.
230
+ AssertionError: If the target tensor 'x' doesn't have the expected
231
+ number of dimensions.
232
+ """
233
+ ndim = x.ndim
234
+ assert 0 <= 1 < ndim
235
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
236
+ shape = [d if i == 1 or i == ndim - 1 else 1
237
+ for i, d in enumerate(x.shape)]
238
+ return freqs_cis.view(*shape)
239
+
240
+ @staticmethod
241
+ def apply_rotary_emb(
242
+ xq: torch.Tensor,
243
+ xk: torch.Tensor,
244
+ freqs_cis: torch.Tensor,
245
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
246
+ """
247
+ Apply rotary embeddings to input tensors using the given frequency
248
+ tensor.
249
+
250
+ This function applies rotary embeddings to the given query 'xq' and
251
+ key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
252
+ input tensors are reshaped as complex numbers, and the frequency tensor
253
+ is reshaped for broadcasting compatibility. The resulting tensors
254
+ contain rotary embeddings and are returned as real tensors.
255
+
256
+ Args:
257
+ xq (torch.Tensor): Query tensor to apply rotary embeddings.
258
+ xk (torch.Tensor): Key tensor to apply rotary embeddings.
259
+ freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
260
+ exponentials.
261
+
262
+ Returns:
263
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor
264
+ and key tensor with rotary embeddings.
265
+ """
266
+ with torch.cuda.amp.autocast(enabled=False):
267
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
268
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
269
+ freqs_cis = Attention.reshape_for_broadcast(freqs_cis, xq_)
270
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
271
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
272
+ return xq_out.type_as(xq), xk_out.type_as(xk)
273
+
274
+ # copied from huggingface modeling_llama.py
275
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
276
+
277
+ def _get_unpad_data(attention_mask):
278
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
279
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
280
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
281
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
282
+ return (
283
+ indices,
284
+ cu_seqlens,
285
+ max_seqlen_in_batch,
286
+ )
287
+
288
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
289
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
290
+
291
+ key_layer = index_first_axis(
292
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
293
+ )
294
+ value_layer = index_first_axis(
295
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
296
+ )
297
+ if query_length == kv_seq_len:
298
+ query_layer = index_first_axis(
299
+ query_layer.reshape(batch_size * kv_seq_len, self.n_local_heads, head_dim), indices_k
300
+ )
301
+ cu_seqlens_q = cu_seqlens_k
302
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
303
+ indices_q = indices_k
304
+ elif query_length == 1:
305
+ max_seqlen_in_batch_q = 1
306
+ cu_seqlens_q = torch.arange(
307
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
308
+ ) # There is a memcpy here, that is very bad.
309
+ indices_q = cu_seqlens_q[:-1]
310
+ query_layer = query_layer.squeeze(1)
311
+ else:
312
+ # The -q_len: slice assumes left padding.
313
+ attention_mask = attention_mask[:, -query_length:]
314
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
315
+
316
+ return (
317
+ query_layer,
318
+ key_layer,
319
+ value_layer,
320
+ indices_q,
321
+ (cu_seqlens_q, cu_seqlens_k),
322
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
323
+ )
324
+
325
+ def forward(
326
+ self,
327
+ x: torch.Tensor, x_mask: torch.Tensor,
328
+ freqs_cis: torch.Tensor,
329
+ y: torch.Tensor, y_mask: torch.Tensor,
330
+ ) -> torch.Tensor:
331
+ """
332
+ Forward pass of the attention module.
333
+
334
+ Args:
335
+ x (torch.Tensor): Input tensor.
336
+ freqs_cis (torch.Tensor): Precomputed frequency tensor.
337
+
338
+ Returns:
339
+ torch.Tensor: Output tensor after attention.
340
+
341
+ """
342
+ bsz, seqlen, _ = x.shape
343
+ xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
344
+ dtype = xq.dtype
345
+
346
+ xq = self.q_norm(xq)
347
+ xk = self.k_norm(xk)
348
+
349
+ xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
350
+ xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
351
+ xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
352
+
353
+ xq, xk = Attention.apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
354
+ xq, xk = xq.to(dtype), xk.to(dtype)
355
+
356
+ if dtype in [torch.float16, torch.bfloat16]:
357
+ # begin var_len flash attn
358
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
359
+ xq, xk, xv, x_mask, seqlen
360
+ )
361
+
362
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
363
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
364
+
365
+ if self.proportional_attn:
366
+ softmax_scale = math.sqrt(math.log(seqlen, self.base_seqlen) / self.head_dim)
367
+ else:
368
+ softmax_scale = math.sqrt(1 / self.head_dim)
369
+ attn_output_unpad = flash_attn_varlen_func(
370
+ query_states,
371
+ key_states,
372
+ value_states,
373
+ cu_seqlens_q=cu_seqlens_q,
374
+ cu_seqlens_k=cu_seqlens_k,
375
+ max_seqlen_q=max_seqlen_in_batch_q,
376
+ max_seqlen_k=max_seqlen_in_batch_k,
377
+ dropout_p=0.,
378
+ causal=False,
379
+ softmax_scale=softmax_scale
380
+ )
381
+ output = pad_input(attn_output_unpad, indices_q, bsz, seqlen)
382
+ # end var_len_flash_attn
383
+
384
+ else:
385
+ output = F.scaled_dot_product_attention(
386
+ xq.permute(0, 2, 1, 3),
387
+ xk.permute(0, 2, 1, 3),
388
+ xv.permute(0, 2, 1, 3),
389
+ attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_local_heads, seqlen, -1),
390
+ ).permute(0, 2, 1, 3).to(dtype)
391
+
392
+ if hasattr(self, "wk_y"):
393
+ yk = self.ky_norm(self.wk_y(y)).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
394
+ yv = self.wv_y(y).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
395
+ n_rep = self.n_local_heads // self.n_local_kv_heads
396
+ if n_rep >= 1:
397
+ yk = yk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
398
+ yv = yv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
399
+ output_y = F.scaled_dot_product_attention(
400
+ xq.permute(0, 2, 1, 3),
401
+ yk.permute(0, 2, 1, 3),
402
+ yv.permute(0, 2, 1, 3),
403
+ y_mask.view(bsz, 1, 1, -1).expand(bsz, self.n_local_heads, seqlen, -1)
404
+ ).permute(0, 2, 1, 3)
405
+ output_y = output_y * self.gate.tanh().view(1, 1, -1, 1)
406
+ output = output + output_y
407
+
408
+ output = output.flatten(-2)
409
+
410
+ return self.wo(output)
411
+
412
+
413
+ class FeedForward(nn.Module):
414
+ def __init__(
415
+ self,
416
+ dim: int,
417
+ hidden_dim: int,
418
+ multiple_of: int,
419
+ ffn_dim_multiplier: Optional[float],
420
+ ):
421
+ """
422
+ Initialize the FeedForward module.
423
+
424
+ Args:
425
+ dim (int): Input dimension.
426
+ hidden_dim (int): Hidden dimension of the feedforward layer.
427
+ multiple_of (int): Value to ensure hidden dimension is a multiple
428
+ of this value.
429
+ ffn_dim_multiplier (float, optional): Custom multiplier for hidden
430
+ dimension. Defaults to None.
431
+
432
+ Attributes:
433
+ w1 (ColumnParallelLinear): Linear transformation for the first
434
+ layer.
435
+ w2 (RowParallelLinear): Linear transformation for the second layer.
436
+ w3 (ColumnParallelLinear): Linear transformation for the third
437
+ layer.
438
+
439
+ """
440
+ super().__init__()
441
+ hidden_dim = int(2 * hidden_dim / 3)
442
+ # custom dim factor multiplier
443
+ if ffn_dim_multiplier is not None:
444
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
445
+ hidden_dim = multiple_of * (
446
+ (hidden_dim + multiple_of - 1) // multiple_of
447
+ )
448
+
449
+ self.w1 = ColumnParallelLinear(
450
+ dim, hidden_dim, bias=False, gather_output=False,
451
+ init_method=nn.init.xavier_uniform_,
452
+ )
453
+ self.w2 = RowParallelLinear(
454
+ hidden_dim, dim, bias=False, input_is_parallel=True,
455
+ init_method=nn.init.xavier_uniform_,
456
+ )
457
+ self.w3 = ColumnParallelLinear(
458
+ dim, hidden_dim, bias=False, gather_output=False,
459
+ init_method=nn.init.xavier_uniform_,
460
+ )
461
+
462
+ # @torch.compile
463
+ def _forward_silu_gating(self, x1, x3):
464
+ return F.silu(x1) * x3
465
+
466
+ def forward(self, x):
467
+ return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
468
+
469
+
470
+ class TransformerBlock(nn.Module):
471
+ def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int,
472
+ multiple_of: int, ffn_dim_multiplier: float, norm_eps: float,
473
+ qk_norm: bool, y_dim: int) -> None:
474
+ """
475
+ Initialize a TransformerBlock.
476
+
477
+ Args:
478
+ layer_id (int): Identifier for the layer.
479
+ dim (int): Embedding dimension of the input features.
480
+ n_heads (int): Number of attention heads.
481
+ n_kv_heads (Optional[int]): Number of attention heads in key and
482
+ value features (if using GQA), or set to None for the same as
483
+ query.
484
+ multiple_of (int):
485
+ ffn_dim_multiplier (float):
486
+ norm_eps (float):
487
+
488
+ Attributes:
489
+ n_heads (int): Number of attention heads.
490
+ dim (int): Dimension size of the model.
491
+ head_dim (int): Dimension size of each attention head.
492
+ attention (Attention): Attention module.
493
+ feed_forward (FeedForward): FeedForward module.
494
+ layer_id (int): Identifier for the layer.
495
+ attention_norm (RMSNorm): Layer normalization for attention output.
496
+ ffn_norm (RMSNorm): Layer normalization for feedforward output.
497
+
498
+ """
499
+ super().__init__()
500
+ self.dim = dim
501
+ self.head_dim = dim // n_heads
502
+ self.attention = Attention(dim, n_heads, n_kv_heads, qk_norm, y_dim)
503
+ self.feed_forward = FeedForward(
504
+ dim=dim, hidden_dim=4 * dim, multiple_of=multiple_of,
505
+ ffn_dim_multiplier=ffn_dim_multiplier,
506
+ )
507
+ self.layer_id = layer_id
508
+ self.attention_norm = RMSNorm(dim, eps=norm_eps)
509
+ self.ffn_norm = RMSNorm(dim, eps=norm_eps)
510
+
511
+ self.adaLN_modulation = nn.Sequential(
512
+ nn.SiLU(),
513
+ ColumnParallelLinear(
514
+ min(dim, 1024), 6 * dim, bias=True, gather_output=True,
515
+ init_method=nn.init.zeros_,
516
+ ),
517
+ )
518
+
519
+ self.attention_y_norm = RMSNorm(y_dim, eps=norm_eps)
520
+
521
+ def forward(
522
+ self,
523
+ x: torch.Tensor,
524
+ x_mask: torch.Tensor,
525
+ y: torch.Tensor,
526
+ y_mask: torch.Tensor,
527
+ freqs_cis: torch.Tensor,
528
+ adaln_input: Optional[torch.Tensor] = None,
529
+ ):
530
+ """
531
+ Perform a forward pass through the TransformerBlock.
532
+
533
+ Args:
534
+ x (torch.Tensor): Input tensor.
535
+ freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
536
+ mask (torch.Tensor, optional): Masking tensor for attention.
537
+ Defaults to None.
538
+
539
+ Returns:
540
+ torch.Tensor: Output tensor after applying attention and
541
+ feedforward layers.
542
+
543
+ """
544
+ if adaln_input is not None:
545
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = \
546
+ self.adaLN_modulation(adaln_input).chunk(6, dim=1)
547
+
548
+ x = x + gate_msa.unsqueeze(1) * self.attention(
549
+ modulate(self.attention_norm(x), shift_msa, scale_msa),
550
+ x_mask,
551
+ freqs_cis,
552
+ self.attention_y_norm(y), y_mask,
553
+ )
554
+ x = x + gate_mlp.unsqueeze(1) * self.feed_forward(
555
+ modulate(self.ffn_norm(x), shift_mlp, scale_mlp),
556
+ )
557
+
558
+ else:
559
+ x = x + self.attention(
560
+ self.attention_norm(x), x_mask, freqs_cis, self.attention_y_norm(y), y_mask,
561
+ )
562
+ x = x + self.feed_forward(self.ffn_norm(x))
563
+
564
+ return x
565
+
566
+ class ParallelFinalLayer(nn.Module):
567
+ """
568
+ The final layer of DiT.
569
+ """
570
+ def __init__(self, hidden_size, patch_size, out_channels):
571
+ super().__init__()
572
+ self.norm_final = nn.LayerNorm(
573
+ hidden_size, elementwise_affine=False, eps=1e-6,
574
+ )
575
+ self.linear = ColumnParallelLinear(
576
+ hidden_size, patch_size * patch_size * out_channels, bias=True,
577
+ init_method=nn.init.zeros_, gather_output=True,
578
+ )
579
+ self.adaLN_modulation = nn.Sequential(
580
+ nn.SiLU(),
581
+ ColumnParallelLinear(
582
+ min(hidden_size, 1024), 2 * hidden_size, bias=True,
583
+ init_method=nn.init.zeros_, gather_output=True,
584
+ ),
585
+ )
586
+
587
+ def forward(self, x, c):
588
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
589
+ x = modulate(self.norm_final(x), shift, scale)
590
+ x = self.linear(x)
591
+ return x
592
+
593
+
594
+ class DiT_Llama(nn.Module):
595
+ """
596
+ Diffusion model with a Transformer backbone.
597
+ """
598
+ def __init__(
599
+ self,
600
+ patch_size: int = 2,
601
+ in_channels: int = 4,
602
+ dim: int = 4096,
603
+ n_layers: int = 32,
604
+ n_heads: int = 32,
605
+ n_kv_heads: Optional[int] = None,
606
+ multiple_of: int = 256,
607
+ ffn_dim_multiplier: Optional[float] = None,
608
+ norm_eps: float = 1e-5,
609
+ learn_sigma: bool = True,
610
+ qk_norm: bool = False,
611
+ cap_feat_dim: int = 5120,
612
+ rope_scaling_factor: float = 1.,
613
+ ntk_factor: float=1.
614
+ ) -> None:
615
+ super().__init__()
616
+ self.learn_sigma = learn_sigma
617
+ self.in_channels = in_channels
618
+ self.out_channels = in_channels * 2 if learn_sigma else in_channels
619
+ self.patch_size = patch_size
620
+
621
+ self.x_embedder = ColumnParallelLinear(
622
+ in_features=patch_size * patch_size * in_channels,
623
+ out_features=dim,
624
+ bias=True,
625
+ gather_output=True,
626
+ init_method=nn.init.xavier_uniform_,
627
+ )
628
+ nn.init.constant_(self.x_embedder.bias, 0.)
629
+
630
+ self.t_embedder = ParallelTimestepEmbedder(min(dim, 1024))
631
+ self.cap_embedder = nn.Sequential(
632
+ nn.LayerNorm(cap_feat_dim),
633
+ ColumnParallelLinear(
634
+ cap_feat_dim, min(dim, 1024), bias=True, gather_output=True,
635
+ init_method=nn.init.zeros_
636
+ ),
637
+ )
638
+
639
+ self.layers = nn.ModuleList([
640
+ TransformerBlock(layer_id, dim, n_heads, n_kv_heads, multiple_of,
641
+ ffn_dim_multiplier, norm_eps, qk_norm, cap_feat_dim)
642
+ for layer_id in range(n_layers)
643
+ ])
644
+ self.final_layer = ParallelFinalLayer(dim, patch_size, self.out_channels)
645
+
646
+ self.freqs_cis = DiT_Llama.precompute_freqs_cis(
647
+ dim // n_heads, 40000, rope_scaling_factor=rope_scaling_factor, ntk_factor=ntk_factor
648
+ )
649
+ self.dim = dim
650
+ self.n_heads = n_heads
651
+ self.rope_scaling_factor = rope_scaling_factor
652
+ self.ntk_factor = ntk_factor
653
+ self.eol_token = nn.Parameter(torch.empty(dim))
654
+ self.pad_token = nn.Parameter(torch.empty(dim))
655
+ nn.init.normal_(self.eol_token, std=0.02)
656
+ nn.init.normal_(self.pad_token, std=0.02)
657
+
658
+ def unpatchify(self, x: torch.Tensor, img_size: List[Tuple[int, int]], return_tensor=False) -> List[torch.Tensor]:
659
+ """
660
+ x: (N, T, patch_size**2 * C)
661
+ imgs: (N, H, W, C)
662
+ """
663
+ pH = pW = self.patch_size
664
+ if return_tensor:
665
+ H, W = img_size[0]
666
+ B = x.size(0)
667
+ L = (H // pH) * (W // pW + 1) # one additional for eol
668
+ x = x[:, :L].view(B, H // pH, W // pW + 1, pH, pW, self.out_channels)
669
+ x = x[:, :, :-1]
670
+ x = x.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)
671
+ return x
672
+ else:
673
+ imgs = []
674
+ for i in range(x.size(0)):
675
+ H, W = img_size[i]
676
+ L = (H // pH) * (W // pW + 1)
677
+ imgs.append(x[i][:L].view(
678
+ H // pH, W // pW + 1, pH, pW, self.out_channels
679
+ )[:, :-1, :, :, :].permute(4, 0, 2, 1, 3).flatten(3, 4).flatten(1, 2))
680
+ return imgs
681
+
682
+ def patchify_and_embed(
683
+ self,
684
+ x: List[torch.Tensor] | torch.Tensor
685
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]]]:
686
+ if isinstance(x, torch.Tensor):
687
+ pH = pW = self.patch_size
688
+ B, C, H, W = x.size()
689
+ x = x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 1, 3, 5).flatten(3)
690
+ x = self.x_embedder(x)
691
+ x = torch.cat([
692
+ x,
693
+ self.eol_token.view(1, 1, 1, -1).expand(B, H // pH, 1, -1),
694
+ ], dim=2)
695
+ x = x.flatten(1, 2)
696
+
697
+ mask = torch.ones(x.shape[0], x.shape[1], dtype=torch.int32, device=x.device)
698
+ return x, mask, [(H, W)] * B
699
+ else:
700
+ pH = pW = self.patch_size
701
+ x_embed = []
702
+ img_size = []
703
+ l_effective_seq_len = []
704
+
705
+ for img in x:
706
+ C, H, W = img.size()
707
+ img_size.append((H, W))
708
+ img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 0, 2, 4).flatten(2)
709
+ img = self.x_embedder(img)
710
+ img = torch.cat([
711
+ img,
712
+ self.eol_token.view(1, 1, -1).expand(H // pH, 1, -1),
713
+ ], dim=1)
714
+ img = img.flatten(0, 1)
715
+ l_effective_seq_len.append(len(img))
716
+ x_embed.append(img)
717
+
718
+ max_seq_len = max(l_effective_seq_len)
719
+ mask = torch.zeros(len(x), max_seq_len, dtype=torch.int32, device=x[0].device)
720
+ padded_x_embed = []
721
+ for i, (item_embed, item_seq_len) in enumerate(zip(x_embed, l_effective_seq_len)):
722
+ item_embed = torch.cat([
723
+ item_embed,
724
+ self.pad_token.view(1, -1).expand(max_seq_len - item_seq_len, -1),
725
+ ], dim=0)
726
+ padded_x_embed.append(item_embed)
727
+ mask[i][:item_seq_len] = 1
728
+
729
+ x_embed = torch.stack(padded_x_embed, dim=0)
730
+ return x_embed, mask, img_size
731
+
732
+ def forward(self, x, t, cap_feats, cap_mask):
733
+ """
734
+ Forward pass of DiT.
735
+ t: (N,) tensor of diffusion timesteps
736
+ y: (N,) tensor of class labels
737
+ """
738
+ x_is_tensor = isinstance(x, torch.Tensor)
739
+ x, mask, img_size = self.patchify_and_embed(x)
740
+ self.freqs_cis = self.freqs_cis.to(x.device)
741
+
742
+ t = self.t_embedder(t) # (N, D)
743
+ cap_mask_float = cap_mask.float().unsqueeze(-1)
744
+ cap_feats_pool = (cap_feats * cap_mask_float).sum(dim=1) / cap_mask_float.sum(dim=1)
745
+ cap_feats_pool = cap_feats_pool.to(cap_feats)
746
+ cap_emb = self.cap_embedder(cap_feats_pool)
747
+ adaln_input = t + cap_emb
748
+
749
+ cap_mask = cap_mask.bool()
750
+ for layer in self.layers:
751
+ x = layer(
752
+ x, mask, cap_feats, cap_mask, self.freqs_cis[:x.size(1)],
753
+ adaln_input=adaln_input
754
+ )
755
+
756
+ x = self.final_layer(x, adaln_input)
757
+ x = self.unpatchify(x, img_size, return_tensor=x_is_tensor)
758
+ if self.learn_sigma:
759
+ if x_is_tensor:
760
+ x, _ = x.chunk(2, dim=1)
761
+ else:
762
+ x = [_.chunk(2, dim=0)[0] for _ in x]
763
+ return x
764
+
765
+ def forward_with_cfg(
766
+ self,
767
+ x,
768
+ t,
769
+ cap_feats,
770
+ cap_mask,
771
+ cfg_scale,
772
+ rope_scaling_factor=None,
773
+ ntk_factor=None,
774
+ base_seqlen: Optional[int] = None,
775
+ proportional_attn: bool = False
776
+ ):
777
+ """
778
+ Forward pass of DiT, but also batches the unconditional forward pass
779
+ for classifier-free guidance.
780
+ """
781
+ # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
782
+
783
+ if rope_scaling_factor is not None or ntk_factor is not None:
784
+ rope_scaling_factor = rope_scaling_factor if rope_scaling_factor is not None else self.rope_scaling_factor
785
+ ntk_factor = ntk_factor if ntk_factor is not None else self.ntk_factor
786
+ if rope_scaling_factor != self.rope_scaling_factor or ntk_factor != self.ntk_factor:
787
+ print(f"override freqs_cis, rope_scaling {rope_scaling_factor}, ntk {ntk_factor}", flush=True)
788
+ self.freqs_cis = DiT_Llama.precompute_freqs_cis(
789
+ self.dim // self.n_heads, 40000,
790
+ rope_scaling_factor=rope_scaling_factor, ntk_factor=ntk_factor
791
+ )
792
+ self.rope_scaling_factor = rope_scaling_factor
793
+ self.ntk_factor = ntk_factor
794
+
795
+ if proportional_attn:
796
+ assert base_seqlen is not None
797
+ for layer in self.layers:
798
+ layer.attention.base_seqlen = base_seqlen
799
+ layer.attention.proportional_attn = proportional_attn
800
+ else:
801
+ for layer in self.layers:
802
+ layer.attention.base_seqlen = None
803
+ layer.attention.proportional_attn = proportional_attn
804
+
805
+ half = x[: len(x) // 2]
806
+ combined = torch.cat([half, half], dim=0)
807
+ model_out = self(combined, t, cap_feats, cap_mask)
808
+ # For exact reproducibility reasons, we apply classifier-free guidance on only
809
+ # three channels by default. The standard approach to cfg applies it to all channels.
810
+ # This can be done by uncommenting the following line and commenting-out the line following that.
811
+ # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
812
+ eps, rest = model_out[:, :3], model_out[:, 3:]
813
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
814
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
815
+ eps = torch.cat([half_eps, half_eps], dim=0)
816
+ return torch.cat([eps, rest], dim=1)
817
+
818
+ @staticmethod
819
+ def precompute_freqs_cis(
820
+ dim: int,
821
+ end: int,
822
+ theta: float = 10000.0,
823
+ rope_scaling_factor: float = 1.0,
824
+ ntk_factor: float = 1.0
825
+ ):
826
+ """
827
+ Precompute the frequency tensor for complex exponentials (cis) with
828
+ given dimensions.
829
+
830
+ This function calculates a frequency tensor with complex exponentials
831
+ using the given dimension 'dim' and the end index 'end'. The 'theta'
832
+ parameter scales the frequencies. The returned tensor contains complex
833
+ values in complex64 data type.
834
+
835
+ Args:
836
+ dim (int): Dimension of the frequency tensor.
837
+ end (int): End index for precomputing frequencies.
838
+ theta (float, optional): Scaling factor for frequency computation.
839
+ Defaults to 10000.0.
840
+
841
+ Returns:
842
+ torch.Tensor: Precomputed frequency tensor with complex
843
+ exponentials.
844
+ """
845
+
846
+ theta = theta * ntk_factor
847
+
848
+ print(f"theta {theta} rope scaling {rope_scaling_factor} ntk {ntk_factor}")
849
+ freqs = 1.0 / (theta ** (
850
+ torch.arange(0, dim, 2)[: (dim // 2)].float().cuda() / dim
851
+ ))
852
+ t = torch.arange(end, device=freqs.device, dtype=torch.float) # type: ignore
853
+ t = t / rope_scaling_factor
854
+ freqs = torch.outer(t, freqs).float() # type: ignore
855
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
856
+ return freqs_cis
857
+
858
+ def parameter_count(self) -> int:
859
+ tensor_parallel_module_list = (
860
+ ColumnParallelLinear, RowParallelLinear, ParallelEmbedding,
861
+ )
862
+ total_params = 0
863
+
864
+ def _recursive_count_params(module):
865
+ nonlocal total_params
866
+ is_tp_module = isinstance(module, tensor_parallel_module_list)
867
+ for param in module.parameters(recurse=False):
868
+ total_params += param.numel() * (
869
+ fs_init.get_model_parallel_world_size()
870
+ if is_tp_module else 1
871
+ )
872
+ for submodule in module.children():
873
+ _recursive_count_params(submodule)
874
+
875
+ _recursive_count_params(self)
876
+ return total_params
877
+
878
+ def get_fsdp_wrap_module_list(self) -> List[nn.Module]:
879
+ return list(self.layers)
880
+
881
+
882
+ #############################################################################
883
+ # DiT Configs #
884
+ #############################################################################
885
+
886
+ def DiT_Llama_2B_patch2(**kwargs):
887
+ return DiT_Llama(
888
+ patch_size=2, dim=2304, n_layers=24, n_heads=32, **kwargs
889
+ )
890
+
891
+ def DiT_Llama_5B_patch2(**kwargs):
892
+ return DiT_Llama(
893
+ patch_size=2, dim=3072, n_layers=32, n_heads=32, **kwargs
894
+ )
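
Editor's note (illustration only, not part of the upload): both model files implement classifier-free guidance by running the conditional and unconditional halves in a single batch and recombining the outputs. The sketch below restates just the recombination step from `forward_with_cfg`, with a random tensor standing in for the model output; guidance is applied to the first three channels only, matching the default above.

```python
import torch

def cfg_combine(model_out: torch.Tensor, cfg_scale: float) -> torch.Tensor:
    """Recombine a cond/uncond batch the way `forward_with_cfg` does.

    `model_out` stacks conditional outputs first and unconditional outputs
    second along the batch axis.
    """
    eps, rest = model_out[:, :3], model_out[:, 3:]
    cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
    half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
    eps = torch.cat([half_eps, half_eps], dim=0)
    return torch.cat([eps, rest], dim=1)

out = cfg_combine(torch.randn(4, 8, 32, 32), cfg_scale=4.0)  # 2 cond + 2 uncond samples
print(out.shape)                                             # torch.Size([4, 8, 32, 32])
```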
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ transformers
2
+ diffusers
3
+ huggingface_hub
4
+ gradio
5
+ torch
6
+ # torch==2.2.2+cu121
7
+ fairscale
8
+ numpy
9
+ pillow
10
+ torchdiffeq
11
+ click
12
+ git+https://github.com/Alpha-VLLM/Lumina-T2X
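
Editor's note: `flash_attn`, which both model files import for the half-precision attention path, is not listed above; it is presumably expected from the environment or pulled in via the git dependency. Separately, the models expose `rope_scaling_factor`, `ntk_factor`, `base_seqlen`, and `proportional_attn` for sampling beyond the training sequence length; below is a minimal sketch of the softmax scale the attention modules switch between (names mirror the attributes above).

```python
import math
from typing import Optional

def softmax_scale(seqlen: int, head_dim: int,
                  base_seqlen: Optional[int] = None,
                  proportional_attn: bool = False) -> float:
    """Softmax scale used by the flash-attention path in Attention.forward.

    With proportional attention enabled, the usual 1/sqrt(d) scale grows with
    log_{base_seqlen}(seqlen), which is intended to keep attention behaviour
    stable when the sequence is longer than the training length.
    """
    if proportional_attn:
        return math.sqrt(math.log(seqlen, base_seqlen) / head_dim)
    return math.sqrt(1 / head_dim)

print(softmax_scale(1024, 72))                                            # ~0.118
print(softmax_scale(4096, 72, base_seqlen=1024, proportional_attn=True))  # ~0.129
```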