rookie9 committed
Commit f582ec6 · verified · 1 Parent(s): 39cfe13

Upload 77 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. app.py +77 -0
  2. models/__pycache__/common.cpython-310.pyc +0 -0
  3. models/__pycache__/content_adapter.cpython-310.pyc +0 -0
  4. models/__pycache__/diffusion.cpython-310.pyc +0 -0
  5. models/__pycache__/diffusion_cfg.cpython-310.pyc +0 -0
  6. models/__pycache__/diffusion_cfg_new.cpython-310.pyc +0 -0
  7. models/__pycache__/diffusion_content_cfg.cpython-310.pyc +0 -0
  8. models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc +0 -0
  9. models/autoencoder/autoencoder_base.py +22 -0
  10. models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc +0 -0
  11. models/autoencoder/waveform/stable_vae.py +537 -0
  12. models/common.py +69 -0
  13. models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc +0 -0
  14. models/content_encoder/__pycache__/content_encoder.cpython-310.pyc +0 -0
  15. models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc +0 -0
  16. models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc +0 -0
  17. models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc +0 -0
  18. models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc +0 -0
  19. models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc +0 -0
  20. models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc +0 -0
  21. models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc +0 -0
  22. models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc +0 -0
  23. models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc +0 -0
  24. models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc +0 -0
  25. models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc +0 -0
  26. models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc +0 -0
  27. models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc +0 -0
  28. models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc +0 -0
  29. models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc +0 -0
  30. models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc +0 -0
  31. models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc +0 -0
  32. models/content_encoder/__pycache__/content_test.cpython-310.pyc +0 -0
  33. models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc +0 -0
  34. models/content_encoder/__pycache__/text_encoder.cpython-310.pyc +0 -0
  35. models/content_encoder/caption_encoder.py +116 -0
  36. models/content_encoder/text_encoder.py +76 -0
  37. models/diffusion.py +398 -0
  38. models/dit/__pycache__/attention.cpython-310.pyc +0 -0
  39. models/dit/__pycache__/audio_dit.cpython-310.pyc +0 -0
  40. models/dit/__pycache__/mask_dit.cpython-310.pyc +0 -0
  41. models/dit/__pycache__/modules.cpython-310.pyc +0 -0
  42. models/dit/__pycache__/rotary.cpython-310.pyc +0 -0
  43. models/dit/__pycache__/span_mask.cpython-310.pyc +0 -0
  44. models/dit/attention.py +350 -0
  45. models/dit/audio_diffsingernet_dit.py +520 -0
  46. models/dit/audio_dit.py +549 -0
  47. models/dit/mask_dit.py +823 -0
  48. models/dit/modules.py +445 -0
  49. models/dit/rotary.py +88 -0
  50. models/dit/span_mask.py +149 -0
app.py ADDED
@@ -0,0 +1,77 @@
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import torch
5
+ import soundfile as sf
6
+ import numpy as np
7
+ from pathlib import Path
8
+ from transformers import AutoModel
9
+ #from utils.llm import get_time_info
10
+ from utils.llm_xiapi import get_time_info
11
+
12
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
+ model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)
14
+ print("ok")
15
+ def is_tdc_format_valid(tdc_str):
16
+ try:
17
+ for event_onset in tdc_str.split('--'):
18
+ event, instance = event_onset.split('__')
19
+ for start_end in instance.split('_'):
20
+ start, end = start_end.split('-')
21
+ return True
22
+ except Exception:
23
+ return False
24
+
25
+ def infer(input_text, input_onset, input_length, time_control):
26
+ # para
27
+ if input_onset and not is_tdc_format_valid(input_onset):
28
+ input_onset = "random"
29
+ if time_control:
30
+ if not input_onset or not input_length:
31
+ input_json = json.loads(get_time_info(input_text))
32
+ input_onset, input_length = input_json["onset"], input_json["length"]
33
+ else:
34
+ input_onset = input_onset if input_onset else "random"
35
+ input_length = input_length if input_length else "10.0"
36
+
37
+ content = {
38
+ "caption": input_text,
39
+ "onset": input_onset,
40
+ "length": input_length
41
+ }
42
+
43
+
44
+ with torch.no_grad():
45
+ waveform = model(content)
46
+ output_wav = "output.wav"
47
+ sf.write(
48
+ output_wav,
49
+ waveform[0, 0].cpu().numpy(),
50
+ samplerate=24000,  # audio sample rate used by PicoAudio2 (24 kHz, see decode_data in caption_encoder.py)
51
+ )
52
+ return output_wav, str(input_onset)
53
+
54
+ demo = gr.Interface(
55
+ fn=infer,
56
+ inputs=[
57
+ gr.Textbox(label="TCC (caption, required)", value="a dog barks"),
58
+ gr.Textbox(label="TDC (optional, see format)", value="random"),
59
+ gr.Textbox(label="Length (seconds, optional)", value="10.0"),
60
+ gr.Checkbox(label="Enable Time Control", value=False),
61
+ ],
62
+ outputs=[
63
+ gr.Audio(label="Generated Audio"),
64
+ gr.Textbox(label="Final TDC Used (input_onset)")
65
+ ],
66
+ title="PicoAudio2 Online Inference",
67
+ description=(
68
+ "TCC (caption) is needed to generate audio. "
69
+ "If you need time control, please enter TDC and length (in seconds). "
70
+ "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. "
71
+ "TDC format: \"event1__start1-end1_start2-end2--event2__start1-end1\", for example: "
72
+ "\"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0\"."
73
+ " If the TDC format is invalid or the length is missing, the model will generate audio without temporal control. Sorry!"
74
+ )
75
+ )
76
+ if __name__ == "__main__":
77
+ demo.launch()
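For reference, a minimal standalone sketch of the TDC grammar described in the interface text above ("event1__start1-end1_start2-end2--event2__start1-end1"). `parse_tdc` is a hypothetical helper, not part of app.py; it only mirrors the splitting that is_tdc_format_valid performs.

    def parse_tdc(tdc_str: str) -> dict[str, list[tuple[float, float]]]:
        # Split "event__spans" groups on "--", spans on "_", each span on "-".
        events: dict[str, list[tuple[float, float]]] = {}
        for event_onset in tdc_str.split("--"):
            event, instance = event_onset.split("__")
            events[event] = [tuple(map(float, span.split("-"))) for span in instance.split("_")]
        return events

    print(parse_tdc("a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0"))
    # {'a_dog_barks': [(1.0, 2.0), (3.0, 4.0)], 'a_man_speaks': [(5.0, 6.0)]}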
models/__pycache__/common.cpython-310.pyc ADDED
Binary file (2.94 kB). View file
 
models/__pycache__/content_adapter.cpython-310.pyc ADDED
Binary file (3.87 kB). View file
 
models/__pycache__/diffusion.cpython-310.pyc ADDED
Binary file (10.5 kB). View file
 
models/__pycache__/diffusion_cfg.cpython-310.pyc ADDED
Binary file (18.9 kB). View file
 
models/__pycache__/diffusion_cfg_new.cpython-310.pyc ADDED
Binary file (18.8 kB). View file
 
models/__pycache__/diffusion_content_cfg.cpython-310.pyc ADDED
Binary file (18.5 kB). View file
 
models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc ADDED
Binary file (1.06 kB). View file
 
models/autoencoder/autoencoder_base.py ADDED
@@ -0,0 +1,22 @@
1
+ from abc import abstractmethod, ABC
2
+ from typing import Sequence
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class AutoEncoderBase(ABC):
8
+ def __init__(
9
+ self, downsampling_ratio: int, sample_rate: int,
10
+ latent_shape: Sequence[int | None]
11
+ ):
12
+ self.downsampling_ratio = downsampling_ratio
13
+ self.sample_rate = sample_rate
14
+ self.latent_token_rate = sample_rate // downsampling_ratio
15
+ self.latent_shape = latent_shape
16
+ self.time_dim = latent_shape.index(None) + 1 # the first dim is batch
17
+
18
+ @abstractmethod
19
+ def encode(
20
+ self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
21
+ ) -> tuple[torch.Tensor, torch.Tensor]:
22
+ ...
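A small usage sketch of the base class above. This is illustrative only: DummyAutoEncoder is a hypothetical subclass, and the 24000 Hz sample rate, 480x downsampling ratio and 128 latent channels are the values quoted in comments elsewhere in this commit.

    import torch
    from models.autoencoder.autoencoder_base import AutoEncoderBase

    class DummyAutoEncoder(AutoEncoderBase):
        def encode(self, waveform: torch.Tensor, waveform_lengths: torch.Tensor):
            raise NotImplementedError  # placeholder; a real subclass returns (latents, latent_mask)

    ae = DummyAutoEncoder(downsampling_ratio=480, sample_rate=24000, latent_shape=(128, None))
    print(ae.latent_token_rate)  # 24000 // 480 = 50 latent frames per second
    print(ae.time_dim)           # (128, None).index(None) + 1 = 2, i.e. (batch, channel, time)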
models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc ADDED
Binary file (12 kB). View file
 
models/autoencoder/waveform/stable_vae.py ADDED
@@ -0,0 +1,537 @@
1
+ from typing import Any, Literal, Callable
2
+ import math
3
+ from pathlib import Path
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ import torchaudio
9
+ from alias_free_torch import Activation1d
10
+
11
+ from models.common import LoadPretrainedBase
12
+ from models.autoencoder.autoencoder_base import AutoEncoderBase
13
+ from utils.torch_utilities import remove_key_prefix_factory, create_mask_from_length
14
+
15
+
16
+ # jit script make it 1.4x faster and save GPU memory
17
+ @torch.jit.script
18
+ def snake_beta(x, alpha, beta):
19
+ return x + (1.0 / (beta+0.000000001)) * pow(torch.sin(x * alpha), 2)
20
+
21
+
22
+ class SnakeBeta(nn.Module):
23
+ def __init__(
24
+ self,
25
+ in_features,
26
+ alpha=1.0,
27
+ alpha_trainable=True,
28
+ alpha_logscale=True
29
+ ):
30
+ super(SnakeBeta, self).__init__()
31
+ self.in_features = in_features
32
+
33
+ # initialize alpha
34
+ self.alpha_logscale = alpha_logscale
35
+ if self.alpha_logscale:
36
+ # log scale alphas initialized to zeros
37
+ self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
38
+ self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
39
+ else:
40
+ # linear scale alphas initialized to ones
41
+ self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
42
+ self.beta = nn.Parameter(torch.ones(in_features) * alpha)
43
+
44
+ self.alpha.requires_grad = alpha_trainable
45
+ self.beta.requires_grad = alpha_trainable
46
+
47
+ # self.no_div_by_zero = 0.000000001
48
+
49
+ def forward(self, x):
50
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
51
+ # line up with x to [B, C, T]
52
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
53
+ if self.alpha_logscale:
54
+ alpha = torch.exp(alpha)
55
+ beta = torch.exp(beta)
56
+ x = snake_beta(x, alpha, beta)
57
+
58
+ return x
59
+
60
+
61
+ def WNConv1d(*args, **kwargs):
62
+ return weight_norm(nn.Conv1d(*args, **kwargs))
63
+
64
+
65
+ def WNConvTranspose1d(*args, **kwargs):
66
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
67
+
68
+
69
+ def get_activation(
70
+ activation: Literal["elu", "snake", "none"],
71
+ antialias=False,
72
+ channels=None
73
+ ) -> nn.Module:
74
+ if activation == "elu":
75
+ act = nn.ELU()
76
+ elif activation == "snake":
77
+ act = SnakeBeta(channels)
78
+ elif activation == "none":
79
+ act = nn.Identity()
80
+ else:
81
+ raise ValueError(f"Unknown activation {activation}")
82
+
83
+ if antialias:
84
+ act = Activation1d(act)
85
+
86
+ return act
87
+
88
+
89
+ class ResidualUnit(nn.Module):
90
+ def __init__(
91
+ self,
92
+ in_channels,
93
+ out_channels,
94
+ dilation,
95
+ use_snake=False,
96
+ antialias_activation=False
97
+ ):
98
+ super().__init__()
99
+
100
+ self.dilation = dilation
101
+
102
+ padding = (dilation * (7-1)) // 2
103
+
104
+ self.layers = nn.Sequential(
105
+ get_activation(
106
+ "snake" if use_snake else "elu",
107
+ antialias=antialias_activation,
108
+ channels=out_channels
109
+ ),
110
+ WNConv1d(
111
+ in_channels=in_channels,
112
+ out_channels=out_channels,
113
+ kernel_size=7,
114
+ dilation=dilation,
115
+ padding=padding
116
+ ),
117
+ get_activation(
118
+ "snake" if use_snake else "elu",
119
+ antialias=antialias_activation,
120
+ channels=out_channels
121
+ ),
122
+ WNConv1d(
123
+ in_channels=out_channels,
124
+ out_channels=out_channels,
125
+ kernel_size=1
126
+ )
127
+ )
128
+
129
+ def forward(self, x):
130
+ res = x
131
+
132
+ #x = checkpoint(self.layers, x)
133
+ x = self.layers(x)
134
+
135
+ return x + res
136
+
137
+
138
+ class EncoderBlock(nn.Module):
139
+ def __init__(
140
+ self,
141
+ in_channels,
142
+ out_channels,
143
+ stride,
144
+ use_snake=False,
145
+ antialias_activation=False
146
+ ):
147
+ super().__init__()
148
+
149
+ self.layers = nn.Sequential(
150
+ ResidualUnit(
151
+ in_channels=in_channels,
152
+ out_channels=in_channels,
153
+ dilation=1,
154
+ use_snake=use_snake
155
+ ),
156
+ ResidualUnit(
157
+ in_channels=in_channels,
158
+ out_channels=in_channels,
159
+ dilation=3,
160
+ use_snake=use_snake
161
+ ),
162
+ ResidualUnit(
163
+ in_channels=in_channels,
164
+ out_channels=in_channels,
165
+ dilation=9,
166
+ use_snake=use_snake
167
+ ),
168
+ get_activation(
169
+ "snake" if use_snake else "elu",
170
+ antialias=antialias_activation,
171
+ channels=in_channels
172
+ ),
173
+ WNConv1d(
174
+ in_channels=in_channels,
175
+ out_channels=out_channels,
176
+ kernel_size=2 * stride,
177
+ stride=stride,
178
+ padding=math.ceil(stride / 2)
179
+ ),
180
+ )
181
+
182
+ def forward(self, x):
183
+ return self.layers(x)
184
+
185
+
186
+ class DecoderBlock(nn.Module):
187
+ def __init__(
188
+ self,
189
+ in_channels,
190
+ out_channels,
191
+ stride,
192
+ use_snake=False,
193
+ antialias_activation=False,
194
+ use_nearest_upsample=False
195
+ ):
196
+ super().__init__()
197
+
198
+ if use_nearest_upsample:
199
+ upsample_layer = nn.Sequential(
200
+ nn.Upsample(scale_factor=stride, mode="nearest"),
201
+ WNConv1d(
202
+ in_channels=in_channels,
203
+ out_channels=out_channels,
204
+ kernel_size=2 * stride,
205
+ stride=1,
206
+ bias=False,
207
+ padding='same'
208
+ )
209
+ )
210
+ else:
211
+ upsample_layer = WNConvTranspose1d(
212
+ in_channels=in_channels,
213
+ out_channels=out_channels,
214
+ kernel_size=2 * stride,
215
+ stride=stride,
216
+ padding=math.ceil(stride / 2)
217
+ )
218
+
219
+ self.layers = nn.Sequential(
220
+ get_activation(
221
+ "snake" if use_snake else "elu",
222
+ antialias=antialias_activation,
223
+ channels=in_channels
224
+ ),
225
+ upsample_layer,
226
+ ResidualUnit(
227
+ in_channels=out_channels,
228
+ out_channels=out_channels,
229
+ dilation=1,
230
+ use_snake=use_snake
231
+ ),
232
+ ResidualUnit(
233
+ in_channels=out_channels,
234
+ out_channels=out_channels,
235
+ dilation=3,
236
+ use_snake=use_snake
237
+ ),
238
+ ResidualUnit(
239
+ in_channels=out_channels,
240
+ out_channels=out_channels,
241
+ dilation=9,
242
+ use_snake=use_snake
243
+ ),
244
+ )
245
+
246
+ def forward(self, x):
247
+ return self.layers(x)
248
+
249
+
250
+ class OobleckEncoder(nn.Module):
251
+ def __init__(
252
+ self,
253
+ in_channels=2,
254
+ channels=128,
255
+ latent_dim=32,
256
+ c_mults=[1, 2, 4, 8],
257
+ strides=[2, 4, 8, 8],
258
+ use_snake=False,
259
+ antialias_activation=False
260
+ ):
261
+ super().__init__()
262
+
263
+ c_mults = [1] + c_mults
264
+
265
+ self.depth = len(c_mults)
266
+
267
+ layers = [
268
+ WNConv1d(
269
+ in_channels=in_channels,
270
+ out_channels=c_mults[0] * channels,
271
+ kernel_size=7,
272
+ padding=3
273
+ )
274
+ ]
275
+
276
+ for i in range(self.depth - 1):
277
+ layers += [
278
+ EncoderBlock(
279
+ in_channels=c_mults[i] * channels,
280
+ out_channels=c_mults[i + 1] * channels,
281
+ stride=strides[i],
282
+ use_snake=use_snake
283
+ )
284
+ ]
285
+
286
+ layers += [
287
+ get_activation(
288
+ "snake" if use_snake else "elu",
289
+ antialias=antialias_activation,
290
+ channels=c_mults[-1] * channels
291
+ ),
292
+ WNConv1d(
293
+ in_channels=c_mults[-1] * channels,
294
+ out_channels=latent_dim,
295
+ kernel_size=3,
296
+ padding=1
297
+ )
298
+ ]
299
+
300
+ self.layers = nn.Sequential(*layers)
301
+
302
+ def forward(self, x):
303
+ return self.layers(x)
304
+
305
+
306
+ class OobleckDecoder(nn.Module):
307
+ def __init__(
308
+ self,
309
+ out_channels=2,
310
+ channels=128,
311
+ latent_dim=32,
312
+ c_mults=[1, 2, 4, 8],
313
+ strides=[2, 4, 8, 8],
314
+ use_snake=False,
315
+ antialias_activation=False,
316
+ use_nearest_upsample=False,
317
+ final_tanh=True
318
+ ):
319
+ super().__init__()
320
+
321
+ c_mults = [1] + c_mults
322
+
323
+ self.depth = len(c_mults)
324
+
325
+ layers = [
326
+ WNConv1d(
327
+ in_channels=latent_dim,
328
+ out_channels=c_mults[-1] * channels,
329
+ kernel_size=7,
330
+ padding=3
331
+ ),
332
+ ]
333
+
334
+ for i in range(self.depth - 1, 0, -1):
335
+ layers += [
336
+ DecoderBlock(
337
+ in_channels=c_mults[i] * channels,
338
+ out_channels=c_mults[i - 1] * channels,
339
+ stride=strides[i - 1],
340
+ use_snake=use_snake,
341
+ antialias_activation=antialias_activation,
342
+ use_nearest_upsample=use_nearest_upsample
343
+ )
344
+ ]
345
+
346
+ layers += [
347
+ get_activation(
348
+ "snake" if use_snake else "elu",
349
+ antialias=antialias_activation,
350
+ channels=c_mults[0] * channels
351
+ ),
352
+ WNConv1d(
353
+ in_channels=c_mults[0] * channels,
354
+ out_channels=out_channels,
355
+ kernel_size=7,
356
+ padding=3,
357
+ bias=False
358
+ ),
359
+ nn.Tanh() if final_tanh else nn.Identity()
360
+ ]
361
+
362
+ self.layers = nn.Sequential(*layers)
363
+
364
+ def forward(self, x):
365
+ return self.layers(x)
366
+
367
+
368
+ class Bottleneck(nn.Module):
369
+ def __init__(self, is_discrete: bool = False):
370
+ super().__init__()
371
+
372
+ self.is_discrete = is_discrete
373
+
374
+ def encode(self, x, return_info=False, **kwargs):
375
+ raise NotImplementedError
376
+
377
+ def decode(self, x):
378
+ raise NotImplementedError
379
+
380
+
381
+ @torch.jit.script
382
+ def vae_sample(mean, scale) -> dict[str, torch.Tensor]:
383
+ stdev = nn.functional.softplus(scale) + 1e-4
384
+ var = stdev * stdev
385
+ logvar = torch.log(var)
386
+ latents = torch.randn_like(mean) * stdev + mean
387
+
388
+ kl = (mean*mean + var - logvar - 1).sum(1).mean()
389
+ return {"latents": latents, "kl": kl}
390
+
391
+
392
+ class VAEBottleneck(Bottleneck):
393
+ def __init__(self):
394
+ super().__init__(is_discrete=False)
395
+
396
+ def encode(self,
397
+ x,
398
+ return_info=False,
399
+ **kwargs) -> dict[str, torch.Tensor] | torch.Tensor:
400
+ mean, scale = x.chunk(2, dim=1)
401
+ sampled = vae_sample(mean, scale)
402
+
403
+ if return_info:
404
+ return sampled["latents"], {"kl": sampled["kl"]}
405
+ else:
406
+ return sampled["latents"]
407
+
408
+ def decode(self, x):
409
+ return x
410
+
411
+
412
+ def compute_mean_kernel(x, y):
413
+ kernel_input = (x[:, None] - y[None]).pow(2).mean(2) / x.shape[-1]
414
+ return torch.exp(-kernel_input).mean()
415
+
416
+
417
+ class Pretransform(nn.Module):
418
+ def __init__(self, enable_grad, io_channels, is_discrete):
419
+ super().__init__()
420
+
421
+ self.is_discrete = is_discrete
422
+ self.io_channels = io_channels
423
+ self.encoded_channels = None
424
+ self.downsampling_ratio = None
425
+
426
+ self.enable_grad = enable_grad
427
+
428
+ def encode(self, x):
429
+ raise NotImplementedError
430
+
431
+ def decode(self, z):
432
+ raise NotImplementedError
433
+
434
+ def tokenize(self, x):
435
+ raise NotImplementedError
436
+
437
+ def decode_tokens(self, tokens):
438
+ raise NotImplementedError
439
+
440
+
441
+ class StableVAE(LoadPretrainedBase, AutoEncoderBase):
442
+ def __init__(
443
+ self,
444
+ encoder,
445
+ decoder,
446
+ latent_dim,
447
+ downsampling_ratio,
448
+ sample_rate,
449
+ io_channels=2,
450
+ bottleneck: Bottleneck = None,
451
+ pretransform: Pretransform = None,
452
+ in_channels=None,
453
+ out_channels=None,
454
+ soft_clip=False,
455
+ pretrained_ckpt: str | Path = None
456
+ ):
457
+ LoadPretrainedBase.__init__(self)
458
+ AutoEncoderBase.__init__(
459
+ self,
460
+ downsampling_ratio=downsampling_ratio,
461
+ sample_rate=sample_rate,
462
+ latent_shape=(latent_dim, None)
463
+ )
464
+
465
+ self.latent_dim = latent_dim
466
+ self.io_channels = io_channels
467
+ self.in_channels = io_channels
468
+ self.out_channels = io_channels
469
+ self.min_length = self.downsampling_ratio
470
+
471
+ if in_channels is not None:
472
+ self.in_channels = in_channels
473
+
474
+ if out_channels is not None:
475
+ self.out_channels = out_channels
476
+
477
+ self.bottleneck = bottleneck
478
+ self.encoder = encoder
479
+ self.decoder = decoder
480
+ self.pretransform = pretransform
481
+ self.soft_clip = soft_clip
482
+ self.is_discrete = self.bottleneck is not None and self.bottleneck.is_discrete
483
+
484
+ self.remove_autoencoder_prefix_fn: Callable = remove_key_prefix_factory(
485
+ "autoencoder."
486
+ )
487
+ if pretrained_ckpt is not None:
488
+ self.load_pretrained(pretrained_ckpt)
489
+
490
+ def process_state_dict(self, model_dict, state_dict):
491
+ state_dict = state_dict["state_dict"]
492
+ state_dict = self.remove_autoencoder_prefix_fn(model_dict, state_dict)
493
+ return state_dict
494
+
495
+ def encode(
496
+ self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
497
+ ) -> tuple[torch.Tensor, torch.Tensor]:
498
+ z = self.encoder(waveform)
499
+ z = self.bottleneck.encode(z)
500
+ z_length = waveform_lengths // self.downsampling_ratio
501
+ z_mask = create_mask_from_length(z_length)
502
+ return z, z_mask
503
+
504
+ def decode(self, latents: torch.Tensor) -> torch.Tensor:
505
+ waveform = self.decoder(latents)
506
+ return waveform
507
+
508
+
509
+ if __name__ == '__main__':
510
+ import hydra
511
+ from utils.config import generate_config_from_command_line_overrides
512
+ model_config = generate_config_from_command_line_overrides(
513
+ "configs/model/autoencoder/stable_vae.yaml"
514
+ )
515
+ autoencoder: StableVAE = hydra.utils.instantiate(model_config)
516
+ autoencoder.eval()
517
+
518
+ waveform, sr = torchaudio.load(
519
+ "/hpc_stor03/sjtu_home/xuenan.xu/workspace/singing_voice_synthesis/diffsinger/data/raw/opencpop/segments/wavs/2007000230.wav"
520
+ )
521
+ waveform = torchaudio.functional.resample(
522
+ waveform, sr, model_config["sample_rate"]
523
+ )
524
+ print("waveform: ", waveform.shape)
525
+ with torch.no_grad():
526
+ latent, latent_length = autoencoder.encode(
527
+ waveform, torch.as_tensor([waveform.shape[-1]])
528
+ )
529
+ print("latent: ", latent.shape)
530
+ reconstructed = autoencoder.decode(latent)
531
+ print("reconstructed: ", reconstructed.shape)
532
+ import soundfile as sf
533
+ sf.write(
534
+ "./reconstructed.wav",
535
+ reconstructed[0, 0].numpy(),
536
+ samplerate=model_config["sample_rate"]
537
+ )
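As a plain-PyTorch illustration of the reparameterization implemented by vae_sample/VAEBottleneck above: a sketch with made-up shapes, whereas the real code is TorchScripted and receives mean/scale by chunking the encoder output along the channel dimension.

    import torch
    import torch.nn.functional as F

    mean = torch.zeros(2, 64, 50)            # (batch, latent_dim, time), hypothetical sizes
    scale = torch.zeros(2, 64, 50)
    stdev = F.softplus(scale) + 1e-4         # strictly positive std, as in vae_sample
    latents = mean + stdev * torch.randn_like(mean)
    var = stdev * stdev
    kl = (mean * mean + var - torch.log(var) - 1).sum(1).mean()
    print(latents.shape, kl.item())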
models/common.py ADDED
@@ -0,0 +1,69 @@
1
+ from pathlib import Path
2
+ import torch
3
+ import torch.nn as nn
4
+ from utils.torch_utilities import load_pretrained_model, merge_matched_keys
5
+ import warnings
6
+
7
+ class LoadPretrainedBase(nn.Module):
8
+ def process_state_dict(
9
+ self, model_dict: dict[str, torch.Tensor],
10
+ state_dict: dict[str, torch.Tensor]
11
+ ):
12
+ """
13
+ Custom processing functions of each model that transforms `state_dict` loaded from
14
+ checkpoints to the state that can be used in `load_state_dict`.
15
+ Use `merge_matched_keys` to update parameters with matched names and shapes by
16
+ default.
17
+
18
+ Args:
19
+ model_dict:
20
+ The state dict of the current model, which is going to load pretrained parameters
21
+ state_dict:
22
+ A dictionary of parameters from a pre-trained model.
23
+
24
+ Returns:
25
+ dict[str, torch.Tensor]:
26
+ The updated state dict, where parameters with matched keys and shape are
27
+ updated with values in `state_dict`.
28
+ """
29
+ state_dict = merge_matched_keys(model_dict, state_dict)
30
+ return state_dict
31
+
32
+ def load_pretrained(self, ckpt_path: str | Path):
33
+ load_pretrained_model(
34
+ self, ckpt_path, state_dict_process_fn=self.process_state_dict
35
+ )
36
+
37
+
38
+ class CountParamsBase(nn.Module):
39
+ def count_params(self):
40
+ num_params = 0
41
+ trainable_params = 0
42
+ for param in self.parameters():
43
+ num_params += param.numel()
44
+ if param.requires_grad:
45
+ trainable_params += param.numel()
46
+ return num_params, trainable_params
47
+
48
+
49
+ class SaveTrainableParamsBase(nn.Module):
50
+ @property
51
+ def param_names_to_save(self):
52
+ names = []
53
+ for name, param in self.named_parameters():
54
+ if param.requires_grad:
55
+ names.append(name)
56
+ for name, _ in self.named_buffers():
57
+ names.append(name)
58
+ return names
59
+
60
+ def load_state_dict(self, state_dict, strict=True, assign=True):
61
+ print("State dict keys:", list(state_dict.keys()))
62
+ #for key in self.param_names_to_save:
63
+ # if key not in state_dict:
64
+ # raise Exception(
65
+ # f"{key} not found in either pre-trained models (e.g. BERT)"
66
+ # " or resumed checkpoints (e.g. epoch_40/model.pt)"
67
+ # )
68
+ # Pass through the `assign` argument for compatibility across PyTorch/transformers versions
69
+ return super().load_state_dict(state_dict, strict=strict, assign=assign)
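The docstring above describes merging only parameters whose names and shapes match. `utils.torch_utilities.merge_matched_keys` itself is not shown in this commit, so the following is only an assumed sketch of that behaviour.

    import torch

    def merge_matched_keys_sketch(model_dict: dict[str, torch.Tensor],
                                  state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        merged = dict(model_dict)                              # start from the current model state
        for name, tensor in state_dict.items():
            if name in merged and merged[name].shape == tensor.shape:
                merged[name] = tensor                          # adopt the pretrained value
        return merged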
models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc ADDED
Binary file (3.51 kB). View file
 
models/content_encoder/__pycache__/content_encoder.cpython-310.pyc ADDED
Binary file (4.72 kB). View file
 
models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc ADDED
Binary file (4.62 kB). View file
 
models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc ADDED
Binary file (6.11 kB). View file
 
models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc ADDED
Binary file (6.12 kB). View file
 
models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc ADDED
Binary file (4.74 kB). View file
 
models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc ADDED
Binary file (4.69 kB). View file
 
models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc ADDED
Binary file (4.73 kB). View file
 
models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc ADDED
Binary file (5.01 kB). View file
 
models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc ADDED
Binary file (5 kB). View file
 
models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc ADDED
Binary file (4.87 kB). View file
 
models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc ADDED
Binary file (4.48 kB). View file
 
models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc ADDED
Binary file (4.59 kB). View file
 
models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc ADDED
Binary file (4.78 kB). View file
 
models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc ADDED
Binary file (4.82 kB). View file
 
models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc ADDED
Binary file (4.71 kB). View file
 
models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc ADDED
Binary file (4.72 kB). View file
 
models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc ADDED
Binary file (4.71 kB). View file
 
models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc ADDED
Binary file (4.58 kB). View file
 
models/content_encoder/__pycache__/content_test.cpython-310.pyc ADDED
Binary file (4.71 kB). View file
 
models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc ADDED
Binary file (4.73 kB). View file
 
models/content_encoder/__pycache__/text_encoder.cpython-310.pyc ADDED
Binary file (2.71 kB). View file
 
models/content_encoder/caption_encoder.py ADDED
@@ -0,0 +1,116 @@
1
+ from typing import Any
2
+ import torch
3
+ import torch.nn as nn
4
+ import random
5
+ from utils.audiotime_event_merge import replace_event_synonyms
6
+
7
+ def decode_data(line_onset_str, latent_length):
8
+ """
9
+ Extracts a timestamp matrix (event onset indices) from a formatted onset string.
10
+
11
+ Args:
12
+ line_onset_str (str): String containing event names and onset intervals,
13
+ formatted like "event1__start1-end1_start2-end2--event2__start1-end1".
14
+ latent_length (int): Length of the output matrix.
15
+
16
+ Returns:
17
+ line_onset_index (torch.Tensor): Matrix of shape [4, latent_length],
18
+ line_event (list): List of event names extracted from the onset string.
19
+
20
+ Notes:
21
+ - 24000 is the audio sample rate.
22
+ - 480 is the downsample ratio to align with VAE.
23
+ - Each onset interval "start-end" (in seconds) is converted to embedding indices via (time * 24000 / 480).
24
+ """
25
+ line_onset_index = torch.zeros((4, latent_length)) # max for 4 events
26
+ line_event = []
27
+ event_idx = 0
28
+ for event_onset in line_onset_str.split('--'):
29
+ #print(event_onset)
30
+ (event, instance) = event_onset.split('__')
31
+ #print(instance)
32
+ line_event.append(event)
33
+ for start_end in instance.split('_'):
34
+ (start, end) = start_end.split('-')
35
+ start, end = int(float(start)*24000/480), int(float(end)*24000/480)
36
+ if end > (latent_length - 1): break
37
+ line_onset_index[event_idx, start: end] = 1
38
+ event_idx = event_idx + 1
39
+ return line_onset_index, line_event
40
+
41
+
42
+ class ContentEncoder(nn.Module):
43
+ """
44
+ ContentEncoder encodes TCC and TDC information.
45
+ """
46
+ def __init__(
47
+ self,
48
+ text_encoder: nn.Module= None,
49
+ ):
50
+ super().__init__()
51
+ self.text_encoder = text_encoder
52
+ self.pool = nn.AdaptiveAvgPool1d(1)
53
+
54
+ def encode_content(
55
+ self, batch_content: list[Any], device: str | torch.device
56
+ ):
57
+ batch_output = []
58
+ batch_mask = []
59
+ batch_onset = []
60
+ length_list = []
61
+ print(batch_content)
62
+ for content in batch_content:
63
+
64
+ caption = content["caption"]
65
+ onset = content["onset"]
66
+ length = int(float(content["length"]) *24000/480)
67
+ # Replacement for AudioTime
68
+ print(onset)
69
+ replace_label = content.get("replace_label", "False")
70
+ if replace_label == "True":
71
+ caption, onset = replace_event_synonyms(caption, onset)
72
+
73
+ # Handle the "random" onset case for real data without timestamps
74
+ if content["onset"] == "random":
75
+ length_list.append(length)
76
+ """
77
+ fixed embedding. Actually it's a sick sentence, a error during training, kept to match the checkpoint.
78
+ You can change it to sentence that difference to captions in datasets.
79
+ The use of fixed text to obtain encoding is for numerical stability.
80
+ We attempted to use learnable unified encoding during training, but the results were not satisfactory.
81
+ """
82
+ event = "There is no event here"
83
+ event_embed = self.text_encoder([event.replace("_", " ")])["output"]
84
+ event_embed = self.pool(event_embed.permute(0, 2, 1)) # (B, 1024, 1)
85
+ event_embed = event_embed.flatten().unsqueeze(0)
86
+ new_onset = event_embed.repeat(length, 1).T
87
+ else:
88
+ onset_matrix, events = decode_data(onset, length)
89
+ length_list.append(length)
90
+ new_onset = torch.zeros((1024, length), device=device) # 1024 for T5
91
+ # TDC
92
+ for (idx, event) in enumerate(events):
93
+ with torch.no_grad():
94
+ event_embed = self.text_encoder([event.replace("_", " ")])["output"]
95
+ event_embed = self.pool(event_embed.permute(0, 2, 1)) # (B, 1024, 1)
96
+ event_embed = event_embed.flatten().unsqueeze(0)
97
+ mask = (onset_matrix[idx, :] == 0)
98
+ cols = mask.nonzero(as_tuple=True)[0]
99
+ new_onset[:, cols] += event_embed.T.float()
100
+ # TCC
101
+ output_dict = self.text_encoder([caption])
102
+ batch_output.append(output_dict["output"][0])
103
+ batch_mask.append(output_dict["mask"][0])
104
+ batch_onset.append(new_onset)
105
+
106
+ # Pad all sequences in the batch to the same length for batching
107
+ batch_output = nn.utils.rnn.pad_sequence(
108
+ batch_output, batch_first=True, padding_value=0
109
+ )
110
+ batch_mask = nn.utils.rnn.pad_sequence(
111
+ batch_mask, batch_first=True, padding_value=False
112
+ )
113
+ batch_onset = nn.utils.rnn.pad_sequence(
114
+ batch_onset, batch_first=True, padding_value=0
115
+ )
116
+ return batch_output, batch_mask, batch_onset, length_list
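A usage sketch for decode_data above, assuming this module imports cleanly from the repo: a 10 s clip at 24 kHz with the 480x downsampling ratio gives latent_length = 10 * 24000 // 480 = 500, and each onset second maps to 50 latent frames.

    from models.content_encoder.caption_encoder import decode_data

    onset_matrix, events = decode_data(
        "a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0", 500
    )
    print(onset_matrix.shape)             # torch.Size([4, 500]); rows cover up to 4 events
    print(events)                         # ['a_dog_barks', 'a_man_speaks']
    print(onset_matrix[0, 50:100].sum())  # frames for 1.0-2.0 s are set to 1 -> tensor(50.)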
models/content_encoder/text_encoder.py ADDED
@@ -0,0 +1,76 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5EncoderModel
4
+ from transformers.modeling_outputs import BaseModelOutput
5
+
6
+ try:
7
+ import torch_npu
8
+ from torch_npu.contrib import transfer_to_npu
9
+ DEVICE_TYPE = "npu"
10
+ except ModuleNotFoundError:
11
+ DEVICE_TYPE = "cuda"
12
+
13
+
14
+ class TransformersTextEncoderBase(nn.Module):
15
+ """
16
+ Base class for text encoding using HuggingFace Transformers models.
17
+
18
+ """
19
+ def __init__(self, model_name: str):
20
+ super().__init__()
21
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
22
+ self.model = AutoModel.from_pretrained(model_name)
23
+
24
+ def forward(
25
+ self,
26
+ text: list[str],
27
+ ):
28
+ device = self.model.device
29
+ batch = self.tokenizer(
30
+ text,
31
+ max_length=self.tokenizer.model_max_length,
32
+ padding=True,
33
+ truncation=True,
34
+ return_tensors="pt"
35
+ )
36
+ input_ids = batch.input_ids.to(device)
37
+ attention_mask = batch.attention_mask.to(device)
38
+ output: BaseModelOutput = self.model(
39
+ input_ids=input_ids, attention_mask=attention_mask
40
+ )
41
+ output = output.last_hidden_state
42
+ mask = (attention_mask == 1).to(device)
43
+
44
+ return {"output": output, "mask": mask}
45
+
46
+
47
+ class T5TextEncoder(TransformersTextEncoderBase):
48
+ """
49
+ Text encoder using T5 encoder model.
50
+ """
51
+ def __init__(self, model_name: str = "/mnt/petrelfs/zhengzihao/cache/google-flan-t5-large"):
52
+ nn.Module.__init__(self)
53
+ self.tokenizer = T5Tokenizer.from_pretrained(model_name)
54
+ self.model = T5EncoderModel.from_pretrained(model_name)
55
+ for param in self.model.parameters():
56
+ param.requires_grad = False
57
+ self.eval()
58
+
59
+ def forward(
60
+ self,
61
+ text: list[str],
62
+ ):
63
+ with torch.no_grad(), torch.amp.autocast(
64
+ device_type=DEVICE_TYPE, enabled=False
65
+ ):
66
+ return super().forward(text)
67
+
68
+
69
+ if __name__ == '__main__':
70
+ text_encoder = T5TextEncoder()
71
+ text = ["dog barking and cat moving"]
72
+ text_encoder.eval()
73
+ with torch.no_grad():
74
+ output = text_encoder(text)
75
+ print(output["output"].shape)
76
+ #print(output)
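Beyond the __main__ check above, the encoder's contract is a dict with padded hidden states and a boolean token mask. A small sketch of that contract; since the default model_name above is a local cache path, the public "google/flan-t5-large" id is substituted here.

    import torch
    from models.content_encoder.text_encoder import T5TextEncoder

    encoder = T5TextEncoder("google/flan-t5-large")
    with torch.no_grad():
        out = encoder(["a dog barks", "a man speaks then a door slams"])
    print(out["output"].shape)  # (2, padded_token_length, 1024) -- flan-t5-large hidden size
    print(out["mask"].shape)    # (2, padded_token_length), True on real (non-padding) tokens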
models/diffusion.py ADDED
@@ -0,0 +1,398 @@
1
+ from typing import Sequence
2
+ import random
3
+ from typing import Any
4
+
5
+ from tqdm import tqdm
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import diffusers.schedulers as noise_schedulers
10
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
11
+ from diffusers.utils.torch_utils import randn_tensor
12
+
13
+ import numpy as np
14
+ from models.autoencoder.autoencoder_base import AutoEncoderBase
15
+ from models.content_encoder.caption_encoder import ContentEncoder
16
+ from models.common import LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase
17
+ from utils.torch_utilities import (
18
+ create_alignment_path, create_mask_from_length, loss_with_mask,
19
+ trim_or_pad_length
20
+ )
21
+
22
+
23
+ class DiffusionMixin:
24
+ def __init__(
25
+ self,
26
+ noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
27
+ snr_gamma: float = None,
28
+ classifier_free_guidance: bool = True,
29
+ cfg_drop_ratio: float = 0.2,
30
+
31
+ ) -> None:
32
+ self.noise_scheduler_name = noise_scheduler_name
33
+ self.snr_gamma = snr_gamma
34
+ self.classifier_free_guidance = classifier_free_guidance
35
+ self.cfg_drop_ratio = cfg_drop_ratio
36
+ self.noise_scheduler = noise_schedulers.DDIMScheduler.from_pretrained(
37
+ self.noise_scheduler_name, subfolder="scheduler"
38
+ )
39
+
40
+ def compute_snr(self, timesteps) -> torch.Tensor:
41
+ """
42
+ Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
43
+ """
44
+ alphas_cumprod = self.noise_scheduler.alphas_cumprod
45
+ sqrt_alphas_cumprod = alphas_cumprod**0.5
46
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5
47
+
48
+ # Expand the tensors.
49
+ # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
50
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device
51
+ )[timesteps].float()
52
+ while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
53
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
54
+ alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
55
+
56
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
57
+ device=timesteps.device
58
+ )[timesteps].float()
59
+ while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
60
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[...,
61
+ None]
62
+ sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
63
+
64
+ # Compute SNR.
65
+ snr = (alpha / sigma)**2
66
+ return snr
67
+
68
+ def get_timesteps(
69
+ self,
70
+ batch_size: int,
71
+ device: torch.device,
72
+ training: bool = True
73
+ ) -> torch.Tensor:
74
+ if training:
75
+ timesteps = torch.randint(
76
+ 0,
77
+ self.noise_scheduler.config.num_train_timesteps,
78
+ (batch_size, ),
79
+ device=device
80
+ )
81
+ else:
82
+ # validation on half of the total timesteps
83
+ timesteps = (self.noise_scheduler.config.num_train_timesteps //
84
+ 2) * torch.ones((batch_size, ),
85
+ dtype=torch.int64,
86
+ device=device)
87
+
88
+ timesteps = timesteps.long()
89
+ return timesteps
90
+
91
+ def get_target(
92
+ self, latent: torch.Tensor, noise: torch.Tensor,
93
+ timesteps: torch.Tensor
94
+ ) -> torch.Tensor:
95
+ """
96
+ Get the target for loss depending on the prediction type
97
+ """
98
+ if self.noise_scheduler.config.prediction_type == "epsilon":
99
+ target = noise
100
+ elif self.noise_scheduler.config.prediction_type == "v_prediction":
101
+ target = self.noise_scheduler.get_velocity(
102
+ latent, noise, timesteps
103
+ )
104
+ else:
105
+ raise ValueError(
106
+ f"Unknown prediction type {self.noise_scheduler.config.prediction_type}"
107
+ )
108
+ return target
109
+
110
+ def loss_with_snr(
111
+ self, pred: torch.Tensor, target: torch.Tensor,
112
+ timesteps: torch.Tensor, mask: torch.Tensor
113
+ ) -> torch.Tensor:
114
+ if self.snr_gamma is None:
115
+ loss = F.mse_loss(pred.float(), target.float(), reduction="none")
116
+ loss = loss_with_mask(loss, mask)
117
+ else:
118
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
119
+ # Adapted from huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py
120
+ snr = self.compute_snr(timesteps)
121
+ mse_loss_weights = (
122
+ torch.stack([snr, self.snr_gamma * torch.ones_like(timesteps)],
123
+ dim=1).min(dim=1)[0] / snr
124
+ )
125
+ loss = F.mse_loss(pred.float(), target.float(), reduction="none")
126
+ loss = loss_with_mask(loss, mask, reduce=False) * mse_loss_weights
127
+ loss = loss.mean()
128
+ return loss
129
+
130
+
131
+ class AudioDiffusion(
132
+ LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
133
+ DiffusionMixin
134
+ ):
135
+ """
136
+ Args:
137
+ autoencoder (AutoEncoderBase): Pretrained autoencoder module VAE(frozen).
138
+ content_encoder (ContentEncoder): Encodes TCC and TDC information.
139
+ backbone (nn.Module): Main denoising network.
140
+ frame_resolution (float): Resolution for audio frames.
141
+ noise_scheduler_name (str): Noise scheduler identifier.
142
+ snr_gamma (float, optional): SNR gamma for noise scheduler.
143
+ classifier_free_guidance (bool): Enable classifier-free guidance.
144
+ cfg_drop_ratio (float): Ratio for randomly dropping context for classifier-free guidance.
145
+ """
146
+ def __init__(
147
+ self,
148
+ autoencoder: AutoEncoderBase,
149
+ content_encoder: ContentEncoder,
150
+ backbone: nn.Module,
151
+ frame_resolution:float,
152
+ noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
153
+ snr_gamma: float = None,
154
+ classifier_free_guidance: bool = True,
155
+ cfg_drop_ratio: float = 0.2,
156
+ ):
157
+ nn.Module.__init__(self)
158
+ DiffusionMixin.__init__(
159
+ self, noise_scheduler_name, snr_gamma, classifier_free_guidance, cfg_drop_ratio
160
+ )
161
+
162
+ self.autoencoder = autoencoder
163
+ # Freeze autoencoder parameters
164
+ for param in self.autoencoder.parameters():
165
+ param.requires_grad = False
166
+
167
+ self.content_encoder = content_encoder
168
+ self.backbone = backbone
169
+ self.frame_resolution = frame_resolution
170
+ self.dummy_param = nn.Parameter(torch.empty(0))
171
+
172
+ def forward(
173
+ self, content: list[Any], condition: list[Any], task: list[str],
174
+ waveform: torch.Tensor, waveform_lengths: torch.Tensor, **kwargs
175
+ ):
176
+ """
177
+ Training forward pass.
178
+
179
+ Args:
180
+ content (list[Any]): List of content dicts for each sample.
181
+ condition (list[Any]): Conditioning information (unused here).
182
+ task (list[str]): List of task types.
183
+ waveform (Tensor): Batch of waveform tensors.
184
+ waveform_lengths (Tensor): Lengths for each waveform sample.
185
+
186
+ Returns:
187
+ dict: Dictionary containing the diffusion loss.
188
+ """
189
+ device = self.dummy_param.device
190
+ num_train_timesteps = self.noise_scheduler.config.num_train_timesteps
191
+ self.noise_scheduler.set_timesteps(num_train_timesteps, device=device)
192
+
193
+ self.autoencoder.eval()
194
+ with torch.no_grad():
195
+ latent, latent_mask = self.autoencoder.encode(
196
+ waveform.unsqueeze(1), waveform_lengths
197
+ )
198
+ # content(non_time_aligned_content) for TCC and time_aligned_content for TDC
199
+ content, content_mask, onset, _= self.content_encoder.encode_content(
200
+ content, device=device
201
+ )
202
+
203
+ # prepare latent and diffusion-related noise
204
+ time_aligned_content = onset.permute(0,2,1)
205
+ if self.training and self.classifier_free_guidance:
206
+ mask_indices = [
207
+ k for k in range(len(waveform)) if random.random() < self.cfg_drop_ratio
208
+ ]
209
+ if len(mask_indices) > 0:
210
+ content[mask_indices] = 0
211
+ time_aligned_content[mask_indices] = 0
212
+
213
+ batch_size = latent.shape[0]
214
+ timesteps = self.get_timesteps(batch_size, device, self.training)
215
+ noise = torch.randn_like(latent)
216
+ noisy_latent = self.noise_scheduler.add_noise(latent, noise, timesteps)
217
+ target = self.get_target(latent, noise, timesteps)
218
+
219
+ # Denoising prediction
220
+ pred: torch.Tensor = self.backbone(
221
+ x=noisy_latent,
222
+ timesteps=timesteps,
223
+ time_aligned_context=time_aligned_content,
224
+ context=content,
225
+ x_mask=latent_mask,
226
+ context_mask=content_mask
227
+ )
228
+ pred = pred.transpose(1, self.autoencoder.time_dim)
229
+ target = target.transpose(1, self.autoencoder.time_dim)
230
+ diff_loss = self.loss_with_snr(pred, target, timesteps, latent_mask)
231
+ return {
232
+ "diff_loss": diff_loss,
233
+ }
234
+
235
+ @torch.no_grad()
236
+ def inference(
237
+ self,
238
+ content: list[Any],
239
+ num_steps: int = 20,
240
+ guidance_scale: float = 3.0,
241
+ guidance_rescale: float = 0.0,
242
+ disable_progress: bool = True,
243
+ num_samples_per_content: int = 1,
244
+ **kwargs
245
+ ):
246
+ """
247
+ Inference/generation method for audio diffusion.
248
+
249
+ Args:
250
+ content (list[Any]): List of content dicts.
251
+ scheduler (SchedulerMixin): Scheduler for timesteps and noise.
252
+ num_steps (int): Number of denoising steps.
253
+ guidance_scale (float): Classifier-free guidance scale.
254
+ guidance_rescale (float): Rescale factor for guidance.
255
+ disable_progress (bool): Disable progress bar.
256
+ num_samples_per_content (int): How many samples to generate per content.
257
+
258
+ Returns:
259
+ waveform (Tensor): Generated waveform.
260
+ """
261
+ device = self.dummy_param.device
262
+ classifier_free_guidance = guidance_scale > 1.0
263
+ batch_size = len(content) * num_samples_per_content
264
+ print(content)
265
+ if classifier_free_guidance:
266
+ content, content_mask, onset, length_list = self.encode_content_classifier_free(
267
+ content, num_samples_per_content
268
+ )
269
+ else:
270
+ content, content_mask, onset, length_list = self.content_encoder.encode_content(
271
+ content, device=device
272
+ )
273
+ content = content.repeat_interleave(num_samples_per_content, 0)
274
+ content_mask = content_mask.repeat_interleave(
275
+ num_samples_per_content, 0
276
+ )
277
+
278
+ self.noise_scheduler.set_timesteps(num_steps, device=device)
279
+ timesteps = self.noise_scheduler.timesteps
280
+
281
+
282
+ # prepare input latent and context for the backbone
283
+ shape = (batch_size, 128, onset.shape[2]) # 128 for StableVAE channels
284
+ time_aligned_content = onset.permute(0,2,1)
285
+ latent = randn_tensor(
286
+ shape, generator=None, device=device, dtype=content.dtype
287
+ )
288
+
289
+ # scale the initial noise by the standard deviation required by the scheduler
290
+ latent = latent * self.noise_scheduler.init_noise_sigma
291
+ latent_mask = torch.full((batch_size, onset.shape[2]), False, device=device)
292
+
293
+ for i, length in enumerate(length_list):
294
+ # Set latent mask True for valid time steps for each sample
295
+ latent_mask[i, :length] = True
296
+ num_warmup_steps = len(timesteps) - num_steps * self.noise_scheduler.order
297
+ progress_bar = tqdm(range(num_steps), disable=disable_progress)
298
+
299
+ if classifier_free_guidance:
300
+ uncond_time_aligned_content = torch.zeros_like(
301
+ time_aligned_content
302
+ )
303
+ time_aligned_content = torch.cat(
304
+ [uncond_time_aligned_content, time_aligned_content]
305
+ )
306
+ latent_mask = torch.cat(
307
+ [latent_mask, latent_mask.detach().clone()]
308
+ )
309
+
310
+ # iteratively denoising
311
+
312
+ for i, timestep in enumerate(timesteps):
313
+
314
+ latent_input = torch.cat(
315
+ [latent, latent]
316
+ ) if classifier_free_guidance else latent
317
+ latent_input = self.noise_scheduler.scale_model_input(latent_input, timestep)
318
+
319
+ noise_pred = self.backbone(
320
+ x=latent_input,
321
+ x_mask=latent_mask,
322
+ timesteps=timestep,
323
+ time_aligned_context=time_aligned_content,
324
+ context=content,
325
+ context_mask=content_mask,
326
+ )
327
+
328
+ if classifier_free_guidance:
329
+ noise_pred_uncond, noise_pred_content = noise_pred.chunk(2)
330
+ noise_pred = noise_pred_uncond + guidance_scale * (
331
+ noise_pred_content - noise_pred_uncond
332
+ )
333
+ if guidance_rescale != 0.0:
334
+ noise_pred = self.rescale_cfg(
335
+ noise_pred_content, noise_pred, guidance_rescale
336
+ )
337
+ # compute the previous noisy sample x_t -> x_t-1
338
+ latent = self.noise_scheduler.step(noise_pred, timestep, latent).prev_sample
339
+
340
+ # call the callback, if provided
341
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
342
+ (i+1) % self.noise_scheduler.order == 0):
343
+ progress_bar.update(1)
344
+ #latent = latent.to(next(self.autoencoder.parameters()).device)
345
+ waveform = self.autoencoder.decode(latent)
346
+ return waveform
347
+
348
+ def encode_content_classifier_free(
349
+ self,
350
+ content: list[Any],
351
+ task: list[str],
352
+ num_samples_per_content: int = 1
353
+ ):
354
+ device = self.dummy_param.device
355
+
356
+ content, content_mask, onset, length_list = self.content_encoder.encode_content(
357
+ content, device=device
358
+ )
359
+ content = content.repeat_interleave(num_samples_per_content, 0)
360
+ content_mask = content_mask.repeat_interleave(
361
+ num_samples_per_content, 0
362
+ )
363
+
364
+ # get unconditional embeddings for classifier free guidance
365
+ uncond_content = torch.zeros_like(content)
366
+ uncond_content_mask = content_mask.detach().clone()
367
+
368
+ uncond_content = uncond_content.repeat_interleave(
369
+ num_samples_per_content, 0
370
+ )
371
+ uncond_content_mask = uncond_content_mask.repeat_interleave(
372
+ num_samples_per_content, 0
373
+ )
374
+
375
+ # For classifier free guidance, we need to do two forward passes.
376
+ # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
377
+ content = torch.cat([uncond_content, content])
378
+ content_mask = torch.cat([uncond_content_mask, content_mask])
379
+
380
+ return content, content_mask, onset, length_list
381
+
382
+ def rescale_cfg(
383
+ self, pred_cond: torch.Tensor, pred_cfg: torch.Tensor,
384
+ guidance_rescale: float
385
+ ):
386
+ """
387
+ Rescale `pred_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
388
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
389
+ """
390
+ std_cond = pred_cond.std(
391
+ dim=list(range(1, pred_cond.ndim)), keepdim=True
392
+ )
393
+ std_cfg = pred_cfg.std(dim=list(range(1, pred_cfg.ndim)), keepdim=True)
394
+
395
+ pred_rescaled = pred_cfg * (std_cond / std_cfg)
396
+ pred_cfg = guidance_rescale * pred_rescaled + (
397
+ 1 - guidance_rescale
398
+ ) * pred_cfg
+ return pred_cfg
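For reference, a numeric sketch of the classifier-free guidance combination used in inference() above: the unconditional and conditional predictions come from one batched forward pass, are split with chunk(2), combined with guidance_scale, and optionally rescaled as in rescale_cfg. The shapes below are hypothetical.

    import torch

    guidance_scale, guidance_rescale = 3.0, 0.7
    noise_pred = torch.randn(2 * 4, 128, 500)                # (2 * batch, latent_dim, time)
    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
    guided = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

    # rescale_cfg: pull the std of the guided prediction back toward the conditional one
    std_cond = noise_pred_cond.std(dim=list(range(1, noise_pred_cond.ndim)), keepdim=True)
    std_cfg = guided.std(dim=list(range(1, guided.ndim)), keepdim=True)
    guided = guidance_rescale * (guided * (std_cond / std_cfg)) + (1 - guidance_rescale) * guided
    print(guided.shape)                                      # torch.Size([4, 128, 500])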
models/dit/__pycache__/attention.cpython-310.pyc ADDED
Binary file (7.7 kB). View file
 
models/dit/__pycache__/audio_dit.cpython-310.pyc ADDED
Binary file (8.31 kB). View file
 
models/dit/__pycache__/mask_dit.cpython-310.pyc ADDED
Binary file (14.6 kB). View file
 
models/dit/__pycache__/modules.cpython-310.pyc ADDED
Binary file (14 kB). View file
 
models/dit/__pycache__/rotary.cpython-310.pyc ADDED
Binary file (2.79 kB). View file
 
models/dit/__pycache__/span_mask.cpython-310.pyc ADDED
Binary file (4.75 kB). View file
 
models/dit/attention.py ADDED
@@ -0,0 +1,350 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.utils.checkpoint
5
+ import einops
6
+ from einops import rearrange, repeat
7
+ from inspect import isfunction
8
+ from .rotary import RotaryEmbedding
9
+ from .modules import RMSNorm
10
+
11
+ if hasattr(nn.functional, 'scaled_dot_product_attention'):
12
+ ATTENTION_MODE = 'flash'
13
+ else:
14
+ ATTENTION_MODE = 'math'
15
+ print(f'attention mode is {ATTENTION_MODE}')
16
+
17
+
18
+ def add_mask(sim, mask):
19
+ b, ndim = sim.shape[0], mask.ndim
20
+ if ndim == 3:
21
+ mask = rearrange(mask, "b n m -> b 1 n m")
22
+ if ndim == 2:
23
+ mask = repeat(mask, "n m -> b 1 n m", b=b)
24
+ max_neg_value = -torch.finfo(sim.dtype).max
25
+ sim = sim.masked_fill(~mask, max_neg_value)
26
+ return sim
27
+
28
+
29
+ def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
30
+ def default(val, d):
31
+ return val if val is not None else (d() if isfunction(d) else d)
32
+
33
+ b, i, j, device = q_shape[0], q_shape[-2], k_shape[-2], device
34
+ #print(q_mask)
35
+ q_mask = default(
36
+ q_mask, torch.ones((b, i), device=device, dtype=torch.bool)
37
+ )
38
+ k_mask = default(
39
+ k_mask, torch.ones((b, j), device=device, dtype=torch.bool)
40
+ )
41
+ attn_mask = rearrange(q_mask, 'b i -> b 1 i 1'
42
+ ) * rearrange(k_mask, 'b j -> b 1 1 j')
43
+ return attn_mask
44
+
45
+
46
+ class Attention(nn.Module):
47
+ def __init__(
48
+ self,
49
+ dim,
50
+ context_dim=None,
51
+ num_heads=8,
52
+ qkv_bias=False,
53
+ qk_scale=None,
54
+ qk_norm=None,
55
+ attn_drop=0.,
56
+ proj_drop=0.,
57
+ rope_mode='none'
58
+ ):
59
+ super().__init__()
60
+ self.num_heads = num_heads
61
+ head_dim = dim // num_heads
62
+ self.scale = qk_scale or head_dim**-0.5
63
+
64
+ if context_dim is None:
65
+ self.cross_attn = False
66
+ else:
67
+ self.cross_attn = True
68
+
69
+ context_dim = dim if context_dim is None else context_dim
70
+
71
+ self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
72
+ self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
73
+ self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
74
+
75
+ if qk_norm is None:
76
+ self.norm_q = nn.Identity()
77
+ self.norm_k = nn.Identity()
78
+ elif qk_norm == 'layernorm':
79
+ self.norm_q = nn.LayerNorm(head_dim)
80
+ self.norm_k = nn.LayerNorm(head_dim)
81
+ elif qk_norm == 'rmsnorm':
82
+ self.norm_q = RMSNorm(head_dim)
83
+ self.norm_k = RMSNorm(head_dim)
84
+ else:
85
+ raise NotImplementedError
86
+
87
+ self.attn_drop_p = attn_drop
88
+ self.attn_drop = nn.Dropout(attn_drop)
89
+ self.proj = nn.Linear(dim, dim)
90
+ self.proj_drop = nn.Dropout(proj_drop)
91
+
92
+ if self.cross_attn:
93
+ assert rope_mode == 'none'
94
+ self.rope_mode = rope_mode
95
+ if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
96
+ self.rotary = RotaryEmbedding(dim=head_dim)
97
+ elif self.rope_mode == 'dual':
98
+ self.rotary_x = RotaryEmbedding(dim=head_dim)
99
+ self.rotary_c = RotaryEmbedding(dim=head_dim)
100
+
101
+ def _rotary(self, q, k, extras):
102
+ if self.rope_mode == 'shared':
103
+ q, k = self.rotary(q=q, k=k)
104
+ elif self.rope_mode == 'x_only':
105
+ q_x, k_x = self.rotary(
106
+ q=q[:, :, extras:, :], k=k[:, :, extras:, :]
107
+ )
108
+ q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
109
+ q = torch.cat((q_c, q_x), dim=2)
110
+ k = torch.cat((k_c, k_x), dim=2)
111
+ elif self.rope_mode == 'dual':
112
+ q_x, k_x = self.rotary_x(
113
+ q=q[:, :, extras:, :], k=k[:, :, extras:, :]
114
+ )
115
+ q_c, k_c = self.rotary_c(
116
+ q=q[:, :, :extras, :], k=k[:, :, :extras, :]
117
+ )
118
+ q = torch.cat((q_c, q_x), dim=2)
119
+ k = torch.cat((k_c, k_x), dim=2)
120
+ elif self.rope_mode == 'none':
121
+ pass
122
+ else:
123
+ raise NotImplementedError
124
+ return q, k
125
+
126
+ def _attn(self, q, k, v, mask_binary):
127
+ if ATTENTION_MODE == 'flash':
128
+ x = F.scaled_dot_product_attention(
129
+ q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
130
+ )
131
+ x = einops.rearrange(x, 'B H L D -> B L (H D)')
132
+ elif ATTENTION_MODE == 'math':
133
+ attn = (q @ k.transpose(-2, -1)) * self.scale
134
+ attn = add_mask(
135
+ attn, mask_binary
136
+ ) if mask_binary is not None else attn
137
+ attn = attn.softmax(dim=-1)
138
+ attn = self.attn_drop(attn)
139
+ x = (attn @ v).transpose(1, 2)
140
+ x = einops.rearrange(x, 'B H L D -> B L (H D)')
141
+ else:
142
+ raise NotImplementedError
143
+ return x
144
+
145
+ def forward(self, x, context=None, context_mask=None, extras=0):
146
+ B, L, C = x.shape
147
+ if context is None:
148
+ context = x
149
+
150
+ q = self.to_q(x)
151
+ k = self.to_k(context)
152
+ v = self.to_v(context)
153
+
154
+ if context_mask is not None:
155
+ mask_binary = create_mask(
156
+ x.shape, context.shape, x.device, None, context_mask
157
+ )
158
+ else:
159
+ mask_binary = None
160
+
161
+ q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads)
162
+ k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads)
163
+ v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads)
164
+
165
+ q = self.norm_q(q)
166
+ k = self.norm_k(k)
167
+
168
+ q, k = self._rotary(q, k, extras)
169
+
170
+ x = self._attn(q, k, v, mask_binary)
171
+
172
+ x = self.proj(x)
173
+ x = self.proj_drop(x)
174
+ return x
175
+
176
+
177
+ class JointAttention(nn.Module):
178
+ def __init__(
179
+ self,
180
+ dim,
181
+ num_heads=8,
182
+ qkv_bias=False,
183
+ qk_scale=None,
184
+ qk_norm=None,
185
+ attn_drop=0.,
186
+ proj_drop=0.,
187
+ rope_mode='none'
188
+ ):
189
+ super().__init__()
190
+ self.num_heads = num_heads
191
+ head_dim = dim // num_heads
192
+ self.scale = qk_scale or head_dim**-0.5
193
+
194
+ self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(
195
+ dim, qkv_bias
196
+ )
197
+ self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(
198
+ dim, qkv_bias
199
+ )
200
+
201
+ self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
202
+ self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)
203
+
204
+ self.attn_drop_p = attn_drop
205
+ self.attn_drop = nn.Dropout(attn_drop)
206
+
207
+ self.proj_x = nn.Linear(dim, dim)
208
+ self.proj_drop_x = nn.Dropout(proj_drop)
209
+
210
+ self.proj_c = nn.Linear(dim, dim)
211
+ self.proj_drop_c = nn.Dropout(proj_drop)
212
+
213
+ self.rope_mode = rope_mode
214
+ if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
215
+ self.rotary = RotaryEmbedding(dim=head_dim)
216
+ elif self.rope_mode == 'dual':
217
+ self.rotary_x = RotaryEmbedding(dim=head_dim)
218
+ self.rotary_c = RotaryEmbedding(dim=head_dim)
219
+
220
+ def _make_qkv_layers(self, dim, qkv_bias):
221
+ return (
222
+ nn.Linear(dim, dim,
223
+ bias=qkv_bias), nn.Linear(dim, dim, bias=qkv_bias),
224
+ nn.Linear(dim, dim, bias=qkv_bias)
225
+ )
226
+
227
+ def _make_norm_layers(self, qk_norm, head_dim):
228
+ if qk_norm is None:
229
+ norm_q = nn.Identity()
230
+ norm_k = nn.Identity()
231
+ elif qk_norm == 'layernorm':
232
+ norm_q = nn.LayerNorm(head_dim)
233
+ norm_k = nn.LayerNorm(head_dim)
234
+ elif qk_norm == 'rmsnorm':
235
+ norm_q = RMSNorm(head_dim)
236
+ norm_k = RMSNorm(head_dim)
237
+ else:
238
+ raise NotImplementedError
239
+ return norm_q, norm_k
240
+
241
+ def _rotary(self, q, k, extras):
242
+ if self.rope_mode == 'shared':
243
+ q, k = self.rotary(q=q, k=k)
244
+ elif self.rope_mode == 'x_only':
245
+ q_x, k_x = self.rotary(
246
+ q=q[:, :, extras:, :], k=k[:, :, extras:, :]
247
+ )
248
+ q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
249
+ q = torch.cat((q_c, q_x), dim=2)
250
+ k = torch.cat((k_c, k_x), dim=2)
251
+ elif self.rope_mode == 'dual':
252
+ q_x, k_x = self.rotary_x(
253
+ q=q[:, :, extras:, :], k=k[:, :, extras:, :]
254
+ )
255
+ q_c, k_c = self.rotary_c(
256
+ q=q[:, :, :extras, :], k=k[:, :, :extras, :]
257
+ )
258
+ q = torch.cat((q_c, q_x), dim=2)
259
+ k = torch.cat((k_c, k_x), dim=2)
260
+ elif self.rope_mode == 'none':
261
+ pass
262
+ else:
263
+ raise NotImplementedError
264
+ return q, k
265
+
266
+ def _attn(self, q, k, v, mask_binary):
267
+ if ATTENTION_MODE == 'flash':
268
+ x = F.scaled_dot_product_attention(
269
+ q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
270
+ )
271
+ x = einops.rearrange(x, 'B H L D -> B L (H D)')
272
+ elif ATTENTION_MODE == 'math':
273
+ attn = (q @ k.transpose(-2, -1)) * self.scale
274
+ attn = add_mask(
275
+ attn, mask_binary
276
+ ) if mask_binary is not None else attn
277
+ attn = attn.softmax(dim=-1)
278
+ attn = self.attn_drop(attn)
279
+ x = (attn @ v).transpose(1, 2)
280
+ x = einops.rearrange(x, 'B H L D -> B L (H D)')
281
+ else:
282
+ raise NotImplementedError
283
+ return x
284
+
285
+ def _cat_mask(self, x, context, x_mask=None, context_mask=None):
286
+ B = x.shape[0]
287
+ if x_mask is None:
288
+ x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
289
+ if context_mask is None:
290
+ context_mask = torch.ones(
291
+ B, context.shape[-2], device=context.device
292
+ ).bool()
293
+ mask = torch.cat([context_mask, x_mask], dim=1)
294
+ return mask
295
+
296
+ def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
297
+ B, Lx, C = x.shape
298
+ _, Lc, _ = context.shape
299
+ if x_mask is not None or context_mask is not None:
300
+ mask = self._cat_mask(
301
+ x, context, x_mask=x_mask, context_mask=context_mask
302
+ )
303
+ shape = [B, Lx + Lc, C]
304
+ mask_binary = create_mask(
305
+ q_shape=shape,
306
+ k_shape=shape,
307
+ device=x.device,
308
+ q_mask=None,
309
+ k_mask=mask
310
+ )
311
+ else:
312
+ mask_binary = None
313
+
314
+ qx, kx, vx = self.to_qx(x), self.to_kx(x), self.to_vx(x)
315
+ qc, kc, vc = self.to_qc(context), self.to_kc(context
316
+ ), self.to_vc(context)
317
+
318
+ qx, kx, vx = map(
319
+ lambda t: einops.
320
+ rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
321
+ [qx, kx, vx]
322
+ )
323
+ qc, kc, vc = map(
324
+ lambda t: einops.
325
+ rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
326
+ [qc, kc, vc]
327
+ )
328
+
329
+ qx, kx = self.norm_qx(qx), self.norm_kx(kx)
330
+ qc, kc = self.norm_qc(qc), self.norm_kc(kc)
331
+
332
+ q, k, v = (
333
+ torch.cat([qc, qx],
334
+ dim=2), torch.cat([kc, kx],
335
+ dim=2), torch.cat([vc, vx], dim=2)
336
+ )
337
+
338
+ q, k = self._rotary(q, k, extras)
339
+
340
+ x = self._attn(q, k, v, mask_binary)
341
+
342
+ context, x = x[:, :Lc, :], x[:, Lc:, :]
343
+
344
+ x = self.proj_x(x)
345
+ x = self.proj_drop_x(x)
346
+
347
+ context = self.proj_c(context)
348
+ context = self.proj_drop_c(context)
349
+
350
+ return x, context
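For orientation, a minimal usage sketch of the cross-attention path defined in `attention.py` above; it assumes the file is importable as `models.dit.attention`, that `ATTENTION_MODE` is resolved at import time in that module, and uses arbitrary example sizes.

```python
import torch
from models.dit.attention import Attention  # import path assumed from this repo layout

# Cross-attention: queries come from x, keys/values from a padded context sequence.
attn = Attention(dim=256, context_dim=512, num_heads=8, qk_norm='rmsnorm')

x = torch.randn(2, 100, 256)                        # (B, Lq, dim)
context = torch.randn(2, 32, 512)                   # (B, Lk, context_dim)
context_mask = torch.ones(2, 32, dtype=torch.bool)  # True = valid position
context_mask[:, 20:] = False                        # mask out padded frames

out = attn(x, context=context, context_mask=context_mask)
print(out.shape)  # torch.Size([2, 100, 256])
```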
models/dit/audio_diffsingernet_dit.py ADDED
@@ -0,0 +1,520 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+
5
+ from .mask_dit import DiTBlock, FinalBlock, UDiT
6
+ from .modules import (
7
+ film_modulate,
8
+ PatchEmbed,
9
+ PE_wrapper,
10
+ TimestepEmbedder,
11
+ RMSNorm,
12
+ )
13
+
14
+
15
+ class AudioDiTBlock(DiTBlock):
16
+ """
17
+ A modified DiT block with time-aligned context added to the latent.
18
+ """
19
+ def __init__(
20
+ self,
21
+ dim,
22
+ time_aligned_context_dim,
23
+ dilation,
24
+ context_dim=None,
25
+ num_heads=8,
26
+ mlp_ratio=4.,
27
+ qkv_bias=False,
28
+ qk_scale=None,
29
+ qk_norm=None,
30
+ act_layer='gelu',
31
+ norm_layer=nn.LayerNorm,
32
+ time_fusion='none',
33
+ ada_sola_rank=None,
34
+ ada_sola_alpha=None,
35
+ skip=False,
36
+ skip_norm=False,
37
+ rope_mode='none',
38
+ context_norm=False,
39
+ use_checkpoint=False
40
+ ):
41
+ super().__init__(
42
+ dim=dim,
43
+ context_dim=context_dim,
44
+ num_heads=num_heads,
45
+ mlp_ratio=mlp_ratio,
46
+ qkv_bias=qkv_bias,
47
+ qk_scale=qk_scale,
48
+ qk_norm=qk_norm,
49
+ act_layer=act_layer,
50
+ norm_layer=norm_layer,
51
+ time_fusion=time_fusion,
52
+ ada_sola_rank=ada_sola_rank,
53
+ ada_sola_alpha=ada_sola_alpha,
54
+ skip=skip,
55
+ skip_norm=skip_norm,
56
+ rope_mode=rope_mode,
57
+ context_norm=context_norm,
58
+ use_checkpoint=use_checkpoint
59
+ )
60
+ # time-aligned context projection
61
+ self.ta_context_projection = nn.Linear(
62
+ time_aligned_context_dim, 2 * dim
63
+ )
64
+ self.dilated_conv = nn.Conv1d(
65
+ dim, 2 * dim, kernel_size=3, padding=dilation, dilation=dilation
66
+ )
67
+
68
+ def forward(
69
+ self,
70
+ x,
71
+ time_aligned_context,
72
+ time_token=None,
73
+ time_ada=None,
74
+ skip=None,
75
+ context=None,
76
+ x_mask=None,
77
+ context_mask=None,
78
+ extras=None
79
+ ):
80
+ if self.use_checkpoint:
81
+ return checkpoint(
82
+ self._forward,
83
+ x,
84
+ time_aligned_context,
85
+ time_token,
86
+ time_ada,
87
+ skip,
88
+ context,
89
+ x_mask,
90
+ context_mask,
91
+ extras,
92
+ use_reentrant=False
93
+ )
94
+ else:
95
+ return self._forward(
96
+ x,
97
+ time_aligned_context,
98
+ time_token,
99
+ time_ada,
100
+ skip,
101
+ context,
102
+ x_mask,
103
+ context_mask,
104
+ extras,
105
+ )
106
+
107
+ def _forward(
108
+ self,
109
+ x,
110
+ time_aligned_context,
111
+ time_token=None,
112
+ time_ada=None,
113
+ skip=None,
114
+ context=None,
115
+ x_mask=None,
116
+ context_mask=None,
117
+ extras=None
118
+ ):
119
+ B, T, C = x.shape
120
+ if self.skip_linear is not None:
121
+ assert skip is not None
122
+ cat = torch.cat([x, skip], dim=-1)
123
+ cat = self.skip_norm(cat)
124
+ x = self.skip_linear(cat)
125
+
126
+ if self.use_adanorm:
127
+ time_ada = self.adaln(time_token, time_ada)
128
+ (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
129
+ gate_mlp) = time_ada.chunk(6, dim=1)
130
+
131
+ # self attention
132
+ if self.use_adanorm:
133
+ x_norm = film_modulate(
134
+ self.norm1(x), shift=shift_msa, scale=scale_msa
135
+ )
136
+ x = x + (1-gate_msa) * self.attn(
137
+ x_norm, context=None, context_mask=x_mask, extras=extras
138
+ )
139
+ else:
140
+ # TODO diffusion timestep input is not fused here
141
+ x = x + self.attn(
142
+ self.norm1(x),
143
+ context=None,
144
+ context_mask=x_mask,
145
+ extras=extras
146
+ )
147
+
148
+ # time-aligned context
149
+ time_aligned_context = self.ta_context_projection(time_aligned_context)
150
+ x = self.dilated_conv(x.transpose(1, 2)
151
+ ).transpose(1, 2) + time_aligned_context
152
+
153
+ gate, filter = torch.chunk(x, 2, dim=-1)
154
+ x = torch.sigmoid(gate) * torch.tanh(filter)
155
+
156
+ # cross attention
157
+ if self.use_context:
158
+ assert context is not None
159
+ x = x + self.cross_attn(
160
+ x=self.norm2(x),
161
+ context=self.norm_context(context),
162
+ context_mask=context_mask,
163
+ extras=extras
164
+ )
165
+
166
+ # mlp
167
+ if self.use_adanorm:
168
+ x_norm = film_modulate(
169
+ self.norm3(x), shift=shift_mlp, scale=scale_mlp
170
+ )
171
+ x = x + (1-gate_mlp) * self.mlp(x_norm)
172
+ else:
173
+ x = x + self.mlp(self.norm3(x))
174
+
175
+ return x
176
+
177
+
178
+ class AudioUDiT(UDiT):
179
+ def __init__(
180
+ self,
181
+ img_size=224,
182
+ patch_size=16,
183
+ in_chans=3,
184
+ input_type='2d',
185
+ out_chans=None,
186
+ embed_dim=768,
187
+ depth=12,
188
+ dilation_cycle_length=4,
189
+ num_heads=12,
190
+ mlp_ratio=4,
191
+ qkv_bias=False,
192
+ qk_scale=None,
193
+ qk_norm=None,
194
+ act_layer='gelu',
195
+ norm_layer='layernorm',
196
+ context_norm=False,
197
+ use_checkpoint=False,
198
+ time_fusion='token',
199
+ ada_sola_rank=None,
200
+ ada_sola_alpha=None,
201
+ cls_dim=None,
202
+ time_aligned_context_dim=768,
203
+ context_dim=768,
204
+ context_fusion='concat',
205
+ context_max_length=128,
206
+ context_pe_method='sinu',
207
+ pe_method='abs',
208
+ rope_mode='none',
209
+ use_conv=True,
210
+ skip=True,
211
+ skip_norm=True
212
+ ):
213
+ nn.Module.__init__(self)
214
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
215
+
216
+ # input
217
+ self.in_chans = in_chans
218
+ self.input_type = input_type
219
+ if self.input_type == '2d':
220
+ num_patches = (img_size[0] //
221
+ patch_size) * (img_size[1] // patch_size)
222
+ elif self.input_type == '1d':
223
+ num_patches = img_size // patch_size
224
+ self.patch_embed = PatchEmbed(
225
+ patch_size=patch_size,
226
+ in_chans=in_chans,
227
+ embed_dim=embed_dim,
228
+ input_type=input_type
229
+ )
230
+ out_chans = in_chans if out_chans is None else out_chans
231
+ self.out_chans = out_chans
232
+
233
+ # position embedding
234
+ self.rope = rope_mode
235
+ self.x_pe = PE_wrapper(
236
+ dim=embed_dim, method=pe_method, length=num_patches
237
+ )
238
+
239
+ # time embed
240
+ self.time_embed = TimestepEmbedder(embed_dim)
241
+ self.time_fusion = time_fusion
242
+ self.use_adanorm = False
243
+
244
+ # cls embed
245
+ if cls_dim is not None:
246
+ self.cls_embed = nn.Sequential(
247
+ nn.Linear(cls_dim, embed_dim, bias=True),
248
+ nn.SiLU(),
249
+ nn.Linear(embed_dim, embed_dim, bias=True),
250
+ )
251
+ else:
252
+ self.cls_embed = None
253
+
254
+ # time fusion
255
+ if time_fusion == 'token':
256
+ # put token at the beginning of sequence
257
+ self.extras = 2 if self.cls_embed else 1
258
+ self.time_pe = PE_wrapper(
259
+ dim=embed_dim, method='abs', length=self.extras
260
+ )
261
+ elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
262
+ self.use_adanorm = True
263
+ # aviod repetitive silu for each adaln block
264
+ self.time_act = nn.SiLU()
265
+ self.extras = 0
266
+ self.time_ada_final = nn.Linear(
267
+ embed_dim, 2 * embed_dim, bias=True
268
+ )
269
+ if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
270
+ # shared adaln
271
+ self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
272
+ else:
273
+ self.time_ada = None
274
+ else:
275
+ raise NotImplementedError
276
+
277
+ # context
278
+ # use a simple projection
279
+ self.use_context = False
280
+ self.context_cross = False
281
+ self.context_max_length = context_max_length
282
+ self.context_fusion = 'none'
283
+ if context_dim is not None:
284
+ self.use_context = True
285
+ self.context_embed = nn.Sequential(
286
+ nn.Linear(context_dim, embed_dim, bias=True),
287
+ nn.SiLU(),
288
+ nn.Linear(embed_dim, embed_dim, bias=True),
289
+ )
290
+ self.context_fusion = context_fusion
291
+ if context_fusion == 'concat' or context_fusion == 'joint':
292
+ self.extras += context_max_length
293
+ self.context_pe = PE_wrapper(
294
+ dim=embed_dim,
295
+ method=context_pe_method,
296
+ length=context_max_length
297
+ )
298
+ # no cross attention layers
299
+ context_dim = None
300
+ elif context_fusion == 'cross':
301
+ self.context_pe = PE_wrapper(
302
+ dim=embed_dim,
303
+ method=context_pe_method,
304
+ length=context_max_length
305
+ )
306
+ self.context_cross = True
307
+ context_dim = embed_dim
308
+ else:
309
+ raise NotImplementedError
310
+
311
+ self.use_skip = skip
312
+
313
+ # norm layers
314
+ if norm_layer == 'layernorm':
315
+ norm_layer = nn.LayerNorm
316
+ elif norm_layer == 'rmsnorm':
317
+ norm_layer = RMSNorm
318
+ else:
319
+ raise NotImplementedError
320
+
321
+ self.in_blocks = nn.ModuleList([
322
+ AudioDiTBlock(
323
+ dim=embed_dim,
324
+ time_aligned_context_dim=time_aligned_context_dim,
325
+ dilation=2**(i % dilation_cycle_length),
326
+ context_dim=context_dim,
327
+ num_heads=num_heads,
328
+ mlp_ratio=mlp_ratio,
329
+ qkv_bias=qkv_bias,
330
+ qk_scale=qk_scale,
331
+ qk_norm=qk_norm,
332
+ act_layer=act_layer,
333
+ norm_layer=norm_layer,
334
+ time_fusion=time_fusion,
335
+ ada_sola_rank=ada_sola_rank,
336
+ ada_sola_alpha=ada_sola_alpha,
337
+ skip=False,
338
+ skip_norm=False,
339
+ rope_mode=self.rope,
340
+ context_norm=context_norm,
341
+ use_checkpoint=use_checkpoint
342
+ ) for i in range(depth // 2)
343
+ ])
344
+
345
+ self.mid_block = AudioDiTBlock(
346
+ dim=embed_dim,
347
+ time_aligned_context_dim=time_aligned_context_dim,
348
+ dilation=1,
349
+ context_dim=context_dim,
350
+ num_heads=num_heads,
351
+ mlp_ratio=mlp_ratio,
352
+ qkv_bias=qkv_bias,
353
+ qk_scale=qk_scale,
354
+ qk_norm=qk_norm,
355
+ act_layer=act_layer,
356
+ norm_layer=norm_layer,
357
+ time_fusion=time_fusion,
358
+ ada_sola_rank=ada_sola_rank,
359
+ ada_sola_alpha=ada_sola_alpha,
360
+ skip=False,
361
+ skip_norm=False,
362
+ rope_mode=self.rope,
363
+ context_norm=context_norm,
364
+ use_checkpoint=use_checkpoint
365
+ )
366
+
367
+ self.out_blocks = nn.ModuleList([
368
+ AudioDiTBlock(
369
+ dim=embed_dim,
370
+ time_aligned_context_dim=time_aligned_context_dim,
371
+ dilation=2**(i % dilation_cycle_length),
372
+ context_dim=context_dim,
373
+ num_heads=num_heads,
374
+ mlp_ratio=mlp_ratio,
375
+ qkv_bias=qkv_bias,
376
+ qk_scale=qk_scale,
377
+ qk_norm=qk_norm,
378
+ act_layer=act_layer,
379
+ norm_layer=norm_layer,
380
+ time_fusion=time_fusion,
381
+ ada_sola_rank=ada_sola_rank,
382
+ ada_sola_alpha=ada_sola_alpha,
383
+ skip=skip,
384
+ skip_norm=skip_norm,
385
+ rope_mode=self.rope,
386
+ context_norm=context_norm,
387
+ use_checkpoint=use_checkpoint
388
+ ) for i in range(depth // 2)
389
+ ])
390
+
391
+ # FinalLayer block
392
+ self.use_conv = use_conv
393
+ self.final_block = FinalBlock(
394
+ embed_dim=embed_dim,
395
+ patch_size=patch_size,
396
+ img_size=img_size,
397
+ in_chans=out_chans,
398
+ input_type=input_type,
399
+ norm_layer=norm_layer,
400
+ use_conv=use_conv,
401
+ use_adanorm=self.use_adanorm
402
+ )
403
+ self.initialize_weights()
404
+
405
+ def forward(
406
+ self,
407
+ x,
408
+ timesteps,
409
+ time_aligned_context,
410
+ context,
411
+ x_mask=None,
412
+ context_mask=None,
413
+ cls_token=None,
414
+ controlnet_skips=None,
415
+ ):
416
+ # make it compatible with int time step during inference
417
+ if timesteps.dim() == 0:
418
+ timesteps = timesteps.expand(x.shape[0]
419
+ ).to(x.device, dtype=torch.long)
420
+
421
+ x = self.patch_embed(x)
422
+ x = self.x_pe(x)
423
+
424
+ B, L, D = x.shape
425
+
426
+ if self.use_context:
427
+ context_token = self.context_embed(context)
428
+ context_token = self.context_pe(context_token)
429
+ if self.context_fusion == 'concat' or self.context_fusion == 'joint':
430
+ x, x_mask = self._concat_x_context(
431
+ x=x,
432
+ context=context_token,
433
+ x_mask=x_mask,
434
+ context_mask=context_mask
435
+ )
436
+ context_token, context_mask = None, None
437
+ else:
438
+ context_token, context_mask = None, None
439
+
440
+ time_token = self.time_embed(timesteps)
441
+ if self.cls_embed:
442
+ cls_token = self.cls_embed(cls_token)
443
+ time_ada = None
444
+ time_ada_final = None
445
+ if self.use_adanorm:
446
+ if self.cls_embed:
447
+ time_token = time_token + cls_token
448
+ time_token = self.time_act(time_token)
449
+ time_ada_final = self.time_ada_final(time_token)
450
+ if self.time_ada is not None:
451
+ time_ada = self.time_ada(time_token)
452
+ else:
453
+ time_token = time_token.unsqueeze(dim=1)
454
+ if self.cls_embed:
455
+ cls_token = cls_token.unsqueeze(dim=1)
456
+ time_token = torch.cat([time_token, cls_token], dim=1)
457
+ time_token = self.time_pe(time_token)
458
+ x = torch.cat((time_token, x), dim=1)
459
+ if x_mask is not None:
460
+ x_mask = torch.cat([
461
+ torch.ones(B, time_token.shape[1],
462
+ device=x_mask.device).bool(), x_mask
463
+ ],
464
+ dim=1)
465
+ time_token = None
466
+
467
+ skips = []
468
+ for blk in self.in_blocks:
469
+ x = blk(
470
+ x=x,
471
+ time_aligned_context=time_aligned_context,
472
+ time_token=time_token,
473
+ time_ada=time_ada,
474
+ skip=None,
475
+ context=context_token,
476
+ x_mask=x_mask,
477
+ context_mask=context_mask,
478
+ extras=self.extras
479
+ )
480
+ if self.use_skip:
481
+ skips.append(x)
482
+
483
+ x = self.mid_block(
484
+ x=x,
485
+ time_aligned_context=time_aligned_context,
486
+ time_token=time_token,
487
+ time_ada=time_ada,
488
+ skip=None,
489
+ context=context_token,
490
+ x_mask=x_mask,
491
+ context_mask=context_mask,
492
+ extras=self.extras
493
+ )
494
+ for blk in self.out_blocks:
495
+ if self.use_skip:
496
+ skip = skips.pop()
497
+ if controlnet_skips:
498
+ # add to skip like u-net controlnet
499
+ skip = skip + controlnet_skips.pop()
500
+ else:
501
+ skip = None
502
+ if controlnet_skips:
503
+ # directly add to x
504
+ x = x + controlnet_skips.pop()
505
+
506
+ x = blk(
507
+ x=x,
508
+ time_aligned_context=time_aligned_context,
509
+ time_token=time_token,
510
+ time_ada=time_ada,
511
+ skip=skip,
512
+ context=context_token,
513
+ x_mask=x_mask,
514
+ context_mask=context_mask,
515
+ extras=self.extras
516
+ )
517
+
518
+ x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
519
+
520
+ return x
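A shape-level sketch of how the `AudioUDiT` above might be driven. All sizes and hyper-parameters here are illustrative assumptions (the real configurations live in the repo's config files), and it assumes `PatchEmbed` accepts `patch_size=1` for 1d inputs and that the module is importable as `models.dit.audio_diffsingernet_dit`.

```python
import torch
from models.dit.audio_diffsingernet_dit import AudioUDiT  # import path assumed

model = AudioUDiT(
    img_size=250,                  # number of latent frames (1d input)
    patch_size=1,
    in_chans=64,                   # latent channels from the VAE
    input_type='1d',
    embed_dim=256,
    depth=4,
    num_heads=4,
    time_fusion='ada',             # adaLN conditioning, so extras == 0
    context_fusion='cross',        # caption tokens enter via cross-attention
    context_dim=768,
    context_max_length=128,
    time_aligned_context_dim=512,
)

B = 2
x = torch.randn(B, 64, 250)         # noisy latent
t = torch.randint(0, 1000, (B,))    # diffusion timesteps
ta_ctx = torch.randn(B, 250, 512)   # frame-aligned content features (one per latent frame)
caption = torch.randn(B, 128, 768)  # text-encoder output, padded to context_max_length

out = model(x, t, time_aligned_context=ta_ctx, context=caption)
print(out.shape)  # torch.Size([2, 64, 250])
```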
models/dit/audio_dit.py ADDED
@@ -0,0 +1,549 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.checkpoint import checkpoint
4
+
5
+ from .mask_dit import DiTBlock, FinalBlock, UDiT
6
+ from .modules import (
7
+ film_modulate,
8
+ PatchEmbed,
9
+ PE_wrapper,
10
+ TimestepEmbedder,
11
+ RMSNorm,
12
+ )
13
+
14
+
15
+ class AudioDiTBlock(DiTBlock):
16
+ """
17
+ A modified DiT block with time-aligned context added to the latent.
18
+ """
19
+ def __init__(
20
+ self,
21
+ dim,
22
+ ta_context_dim,
23
+ ta_context_norm=False,
24
+ context_dim=None,
25
+ num_heads=8,
26
+ mlp_ratio=4.,
27
+ qkv_bias=False,
28
+ qk_scale=None,
29
+ qk_norm=None,
30
+ act_layer='gelu',
31
+ norm_layer=nn.LayerNorm,
32
+ ta_context_fusion='add',
33
+ time_fusion='none',
34
+ ada_sola_rank=None,
35
+ ada_sola_alpha=None,
36
+ skip=False,
37
+ skip_norm=False,
38
+ rope_mode='none',
39
+ context_norm=False,
40
+ use_checkpoint=False
41
+ ):
42
+ super().__init__(
43
+ dim=dim,
44
+ context_dim=context_dim,
45
+ num_heads=num_heads,
46
+ mlp_ratio=mlp_ratio,
47
+ qkv_bias=qkv_bias,
48
+ qk_scale=qk_scale,
49
+ qk_norm=qk_norm,
50
+ act_layer=act_layer,
51
+ norm_layer=norm_layer,
52
+ time_fusion=time_fusion,
53
+ ada_sola_rank=ada_sola_rank,
54
+ ada_sola_alpha=ada_sola_alpha,
55
+ skip=skip,
56
+ skip_norm=skip_norm,
57
+ rope_mode=rope_mode,
58
+ context_norm=context_norm,
59
+ use_checkpoint=use_checkpoint
60
+ )
61
+ self.ta_context_fusion = ta_context_fusion
62
+ self.ta_context_norm = ta_context_norm
63
+ if self.ta_context_fusion == "add":
64
+ self.ta_context_projection = nn.Linear(ta_context_dim, dim)
65
+ self.ta_context_norm = norm_layer(
66
+ ta_context_dim
67
+ ) if self.ta_context_norm else nn.Identity()
68
+ elif self.ta_context_fusion == "concat":
69
+ self.ta_context_projection = nn.Linear(ta_context_dim + dim, dim)
70
+ self.ta_context_norm = norm_layer(
71
+ ta_context_dim + dim
72
+ ) if self.ta_context_norm else nn.Identity()
73
+
74
+ def forward(
75
+ self,
76
+ x,
77
+ time_aligned_context,
78
+ time_token=None,
79
+ time_ada=None,
80
+ skip=None,
81
+ context=None,
82
+ x_mask=None,
83
+ context_mask=None,
84
+ extras=None
85
+ ):
86
+ if self.use_checkpoint:
87
+ return checkpoint(
88
+ self._forward,
89
+ x,
90
+ time_aligned_context,
91
+ time_token,
92
+ time_ada,
93
+ skip,
94
+ context,
95
+ x_mask,
96
+ context_mask,
97
+ extras,
98
+ use_reentrant=False
99
+ )
100
+ else:
101
+ return self._forward(
102
+ x,
103
+ time_aligned_context,
104
+ time_token,
105
+ time_ada,
106
+ skip,
107
+ context,
108
+ x_mask,
109
+ context_mask,
110
+ extras,
111
+ )
112
+
113
+ def _forward(
114
+ self,
115
+ x,
116
+ time_aligned_context,
117
+ time_token=None,
118
+ time_ada=None,
119
+ skip=None,
120
+ context=None,
121
+ x_mask=None,
122
+ context_mask=None,
123
+ extras=None
124
+ ):
125
+ B, T, C = x.shape
126
+
127
+ # # time aligned context
128
+ # if self.ta_context_fusion == "add":
129
+ # time_aligned_context = self.ta_context_projection(
130
+ # self.ta_context_norm(time_aligned_context)
131
+ # )
132
+ # x = x + time_aligned_context
133
+ # elif self.ta_context_fusion == "concat":
134
+ # cat = torch.cat([x, time_aligned_context], dim=-1)
135
+ # cat = self.ta_context_norm(cat)
136
+ # x = self.ta_context_projection(cat)
137
+
138
+ # skip connection
139
+ if self.skip_linear is not None:
140
+ assert skip is not None
141
+ cat = torch.cat([x, skip], dim=-1)
142
+ cat = self.skip_norm(cat)
143
+ x = self.skip_linear(cat)
144
+ #print('skip')
145
+ #print(x)
146
+ if self.use_adanorm:
147
+ time_ada = self.adaln(time_token, time_ada)
148
+ (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
149
+ gate_mlp) = time_ada.chunk(6, dim=1)
150
+
151
+ # self attention
152
+ if self.use_adanorm:
153
+ x_norm = film_modulate(
154
+ self.norm1(x), shift=shift_msa, scale=scale_msa
155
+ )
156
+ x = x + (1-gate_msa) * self.attn(
157
+ x_norm, context=None, context_mask=x_mask, extras=extras
158
+ )
159
+ else:
160
+ # TODO diffusion timestep input is not fused here
161
+ x = x + self.attn(
162
+ self.norm1(x),
163
+ context=None,
164
+ context_mask=x_mask,
165
+ extras=extras
166
+ )
167
+
168
+ # time aligned context fusion
169
+ if self.ta_context_fusion == "add":
170
+ time_aligned_context = self.ta_context_projection(
171
+ self.ta_context_norm(time_aligned_context)
172
+ )
173
+ x = x + time_aligned_context
174
+ elif self.ta_context_fusion == "concat":
175
+ cat = torch.cat([x, time_aligned_context], dim=-1)
176
+ cat = self.ta_context_norm(cat)
177
+ x = self.ta_context_projection(cat)
178
+
179
+ # cross attention
180
+ if self.use_context:
181
+ assert context is not None
182
+ x = x + self.cross_attn(
183
+ x=self.norm2(x),
184
+ context=self.norm_context(context),
185
+ context_mask=context_mask,
186
+ extras=extras
187
+ )
188
+
189
+ # mlp
190
+ if self.use_adanorm:
191
+ x_norm = film_modulate(
192
+ self.norm3(x), shift=shift_mlp, scale=scale_mlp
193
+ )
194
+ x = x + (1-gate_mlp) * self.mlp(x_norm)
195
+ else:
196
+ x = x + self.mlp(self.norm3(x))
197
+
198
+ return x
199
+
200
+
201
+ class AudioUDiT(UDiT):
202
+ def __init__(
203
+ self,
204
+ img_size=224,
205
+ patch_size=16,
206
+ in_chans=3,
207
+ input_type='2d',
208
+ out_chans=None,
209
+ embed_dim=768,
210
+ depth=12,
211
+ num_heads=12,
212
+ mlp_ratio=4,
213
+ qkv_bias=False,
214
+ qk_scale=None,
215
+ qk_norm=None,
216
+ act_layer='gelu',
217
+ norm_layer='layernorm',
218
+ context_norm=False,
219
+ use_checkpoint=False,
220
+ time_fusion='token',
221
+ ada_sola_rank=None,
222
+ ada_sola_alpha=None,
223
+ cls_dim=None,
224
+ ta_context_dim=768,
225
+ ta_context_fusion='concat',
226
+ ta_context_norm=True,
227
+ context_dim=768,
228
+ context_fusion='concat',
229
+ context_max_length=128,
230
+ context_pe_method='sinu',
231
+ pe_method='abs',
232
+ rope_mode='none',
233
+ use_conv=True,
234
+ skip=True,
235
+ skip_norm=True
236
+ ):
237
+ nn.Module.__init__(self)
238
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
239
+
240
+ # input
241
+ self.in_chans = in_chans
242
+ self.input_type = input_type
243
+ if self.input_type == '2d':
244
+ num_patches = (img_size[0] //
245
+ patch_size) * (img_size[1] // patch_size)
246
+ elif self.input_type == '1d':
247
+ num_patches = img_size // patch_size
248
+ self.patch_embed = PatchEmbed(
249
+ patch_size=patch_size,
250
+ in_chans=in_chans,
251
+ embed_dim=embed_dim,
252
+ input_type=input_type
253
+ )
254
+ out_chans = in_chans if out_chans is None else out_chans
255
+ self.out_chans = out_chans
256
+
257
+ # position embedding
258
+ self.rope = rope_mode
259
+ self.x_pe = PE_wrapper(
260
+ dim=embed_dim, method=pe_method, length=num_patches
261
+ )
262
+
263
+ # time embed
264
+ self.time_embed = TimestepEmbedder(embed_dim)
265
+ self.time_fusion = time_fusion
266
+ self.use_adanorm = False
267
+
268
+ # cls embed
269
+ if cls_dim is not None:
270
+ self.cls_embed = nn.Sequential(
271
+ nn.Linear(cls_dim, embed_dim, bias=True),
272
+ nn.SiLU(),
273
+ nn.Linear(embed_dim, embed_dim, bias=True),
274
+ )
275
+ else:
276
+ self.cls_embed = None
277
+
278
+ # time fusion
279
+ if time_fusion == 'token':
280
+ # put token at the beginning of sequence
281
+ self.extras = 2 if self.cls_embed else 1
282
+ self.time_pe = PE_wrapper(
283
+ dim=embed_dim, method='abs', length=self.extras
284
+ )
285
+ elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
286
+ self.use_adanorm = True
287
+ # aviod repetitive silu for each adaln block
288
+ self.time_act = nn.SiLU()
289
+ self.extras = 0
290
+ self.time_ada_final = nn.Linear(
291
+ embed_dim, 2 * embed_dim, bias=True
292
+ )
293
+ if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
294
+ # shared adaln
295
+ self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
296
+ else:
297
+ self.time_ada = None
298
+ else:
299
+ raise NotImplementedError
300
+
301
+ # context
302
+ # use a simple projection
303
+ self.use_context = False
304
+ self.context_cross = False
305
+ self.context_max_length = context_max_length
306
+ self.context_fusion = 'none'
307
+ if context_dim is not None:
308
+ self.use_context = True
309
+ self.context_embed = nn.Sequential(
310
+ nn.Linear(context_dim, embed_dim, bias=True),
311
+ nn.SiLU(),
312
+ nn.Linear(embed_dim, embed_dim, bias=True),
313
+ )
314
+ self.context_fusion = context_fusion
315
+ if context_fusion == 'concat' or context_fusion == 'joint':
316
+ self.extras += context_max_length
317
+ self.context_pe = PE_wrapper(
318
+ dim=embed_dim,
319
+ method=context_pe_method,
320
+ length=context_max_length
321
+ )
322
+ # no cross attention layers
323
+ context_dim = None
324
+ elif context_fusion == 'cross':
325
+ self.context_pe = PE_wrapper(
326
+ dim=embed_dim,
327
+ method=context_pe_method,
328
+ length=context_max_length
329
+ )
330
+ self.context_cross = True
331
+ context_dim = embed_dim
332
+ else:
333
+ raise NotImplementedError
334
+
335
+ self.use_skip = skip
336
+
337
+ # norm layers
338
+ if norm_layer == 'layernorm':
339
+ norm_layer = nn.LayerNorm
340
+ elif norm_layer == 'rmsnorm':
341
+ norm_layer = RMSNorm
342
+ else:
343
+ raise NotImplementedError
344
+
345
+ self.in_blocks = nn.ModuleList([
346
+ AudioDiTBlock(
347
+ dim=embed_dim,
348
+ ta_context_dim=ta_context_dim,
349
+ ta_context_fusion=ta_context_fusion,
350
+ ta_context_norm=ta_context_norm,
351
+ context_dim=context_dim,
352
+ num_heads=num_heads,
353
+ mlp_ratio=mlp_ratio,
354
+ qkv_bias=qkv_bias,
355
+ qk_scale=qk_scale,
356
+ qk_norm=qk_norm,
357
+ act_layer=act_layer,
358
+ norm_layer=norm_layer,
359
+ time_fusion=time_fusion,
360
+ ada_sola_rank=ada_sola_rank,
361
+ ada_sola_alpha=ada_sola_alpha,
362
+ skip=False,
363
+ skip_norm=False,
364
+ rope_mode=self.rope,
365
+ context_norm=context_norm,
366
+ use_checkpoint=use_checkpoint
367
+ ) for i in range(depth // 2)
368
+ ])
369
+
370
+ self.mid_block = AudioDiTBlock(
371
+ dim=embed_dim,
372
+ ta_context_dim=ta_context_dim,
373
+ context_dim=context_dim,
374
+ num_heads=num_heads,
375
+ mlp_ratio=mlp_ratio,
376
+ qkv_bias=qkv_bias,
377
+ qk_scale=qk_scale,
378
+ qk_norm=qk_norm,
379
+ act_layer=act_layer,
380
+ norm_layer=norm_layer,
381
+ time_fusion=time_fusion,
382
+ ada_sola_rank=ada_sola_rank,
383
+ ada_sola_alpha=ada_sola_alpha,
384
+ ta_context_fusion=ta_context_fusion,
385
+ ta_context_norm=ta_context_norm,
386
+ skip=False,
387
+ skip_norm=False,
388
+ rope_mode=self.rope,
389
+ context_norm=context_norm,
390
+ use_checkpoint=use_checkpoint
391
+ )
392
+
393
+ self.out_blocks = nn.ModuleList([
394
+ AudioDiTBlock(
395
+ dim=embed_dim,
396
+ ta_context_dim=ta_context_dim,
397
+ context_dim=context_dim,
398
+ num_heads=num_heads,
399
+ mlp_ratio=mlp_ratio,
400
+ qkv_bias=qkv_bias,
401
+ qk_scale=qk_scale,
402
+ qk_norm=qk_norm,
403
+ act_layer=act_layer,
404
+ norm_layer=norm_layer,
405
+ time_fusion=time_fusion,
406
+ ada_sola_rank=ada_sola_rank,
407
+ ada_sola_alpha=ada_sola_alpha,
408
+ ta_context_fusion=ta_context_fusion,
409
+ ta_context_norm=ta_context_norm,
410
+ skip=skip,
411
+ skip_norm=skip_norm,
412
+ rope_mode=self.rope,
413
+ context_norm=context_norm,
414
+ use_checkpoint=use_checkpoint
415
+ ) for i in range(depth // 2)
416
+ ])
417
+
418
+ # FinalLayer block
419
+ self.use_conv = use_conv
420
+ self.final_block = FinalBlock(
421
+ embed_dim=embed_dim,
422
+ patch_size=patch_size,
423
+ img_size=img_size,
424
+ in_chans=out_chans,
425
+ input_type=input_type,
426
+ norm_layer=norm_layer,
427
+ use_conv=use_conv,
428
+ use_adanorm=self.use_adanorm
429
+ )
430
+ self.initialize_weights()
431
+
432
+ def forward(
433
+ self,
434
+ x,
435
+ timesteps,
436
+ time_aligned_context,
437
+ context,
438
+ x_mask=None,
439
+ context_mask=None,
440
+ cls_token=None,
441
+ controlnet_skips=None,
442
+ ):
443
+ # make it compatible with int time step during inference
444
+ if timesteps.dim() == 0:
445
+ timesteps = timesteps.expand(x.shape[0]
446
+ ).to(x.device, dtype=torch.long)
447
+
448
+ x = self.patch_embed(x)
449
+ x = self.x_pe(x)
450
+
451
+ B, L, D = x.shape
452
+
453
+ if self.use_context:
454
+ context_token = self.context_embed(context)
455
+ context_token = self.context_pe(context_token)
456
+ if self.context_fusion == 'concat' or self.context_fusion == 'joint':
457
+ x, x_mask = self._concat_x_context(
458
+ x=x,
459
+ context=context_token,
460
+ x_mask=x_mask,
461
+ context_mask=context_mask
462
+ )
463
+ context_token, context_mask = None, None
464
+ else:
465
+ context_token, context_mask = None, None
466
+
467
+ time_token = self.time_embed(timesteps)
468
+ if self.cls_embed:
469
+ cls_token = self.cls_embed(cls_token)
470
+ time_ada = None
471
+ time_ada_final = None
472
+ if self.use_adanorm:
473
+ if self.cls_embed:
474
+ time_token = time_token + cls_token
475
+ time_token = self.time_act(time_token)
476
+ time_ada_final = self.time_ada_final(time_token)
477
+ if self.time_ada is not None:
478
+ time_ada = self.time_ada(time_token)
479
+ else:
480
+ time_token = time_token.unsqueeze(dim=1)
481
+ if self.cls_embed:
482
+ cls_token = cls_token.unsqueeze(dim=1)
483
+ time_token = torch.cat([time_token, cls_token], dim=1)
484
+ time_token = self.time_pe(time_token)
485
+ x = torch.cat((time_token, x), dim=1)
486
+ if x_mask is not None:
487
+ x_mask = torch.cat([
488
+ torch.ones(B, time_token.shape[1],
489
+ device=x_mask.device).bool(), x_mask
490
+ ],
491
+ dim=1)
492
+ time_token = None
493
+
494
+ skips = []
495
+ for blk in self.in_blocks:
496
+ x = blk(
497
+ x=x,
498
+ time_aligned_context=time_aligned_context,
499
+ time_token=time_token,
500
+ time_ada=time_ada,
501
+ skip=None,
502
+ context=context_token,
503
+ x_mask=x_mask,
504
+ context_mask=context_mask,
505
+ extras=self.extras
506
+ )
507
+
508
+ if self.use_skip:
509
+ skips.append(x)
510
+
511
+ x = self.mid_block(
512
+ x=x,
513
+ time_aligned_context=time_aligned_context,
514
+ time_token=time_token,
515
+ time_ada=time_ada,
516
+ skip=None,
517
+ context=context_token,
518
+ x_mask=x_mask,
519
+ context_mask=context_mask,
520
+ extras=self.extras
521
+ )
522
+
523
+ for blk in self.out_blocks:
524
+ if self.use_skip:
525
+ skip = skips.pop()
526
+ if controlnet_skips:
527
+ # add to skip like u-net controlnet
528
+ skip = skip + controlnet_skips.pop()
529
+ else:
530
+ skip = None
531
+ if controlnet_skips:
532
+ # directly add to x
533
+ x = x + controlnet_skips.pop()
534
+
535
+ x = blk(
536
+ x=x,
537
+ time_aligned_context=time_aligned_context,
538
+ time_token=time_token,
539
+ time_ada=time_ada,
540
+ skip=skip,
541
+ context=context_token,
542
+ x_mask=x_mask,
543
+ context_mask=context_mask,
544
+ extras=self.extras
545
+ )
546
+
547
+ x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
548
+
549
+ return x
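A small standalone illustration of the two `ta_context_fusion` modes implemented by the block above ('add' vs 'concat'); the optional norm layer is omitted and all sizes are arbitrary.

```python
import torch
import torch.nn as nn

B, T, dim, ta_dim = 2, 250, 256, 512
x = torch.randn(B, T, dim)          # hidden states after self-attention
ta_ctx = torch.randn(B, T, ta_dim)  # frame-aligned content features

# 'add': project the context to the model width and add it to x
proj_add = nn.Linear(ta_dim, dim)
x_add = x + proj_add(ta_ctx)

# 'concat': concatenate along channels, then project back to the model width
proj_cat = nn.Linear(ta_dim + dim, dim)
x_cat = proj_cat(torch.cat([x, ta_ctx], dim=-1))

print(x_add.shape, x_cat.shape)     # both torch.Size([2, 250, 256])
```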
models/dit/mask_dit.py ADDED
@@ -0,0 +1,823 @@
1
+ import logging
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.utils.checkpoint import checkpoint
6
+
7
+ from .modules import (
8
+ film_modulate,
9
+ unpatchify,
10
+ PatchEmbed,
11
+ PE_wrapper,
12
+ TimestepEmbedder,
13
+ FeedForward,
14
+ RMSNorm,
15
+ )
16
+ from .span_mask import compute_mask_indices
17
+ from .attention import Attention
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class AdaLN(nn.Module):
23
+ def __init__(self, dim, ada_mode='ada', r=None, alpha=None):
24
+ super().__init__()
25
+ self.ada_mode = ada_mode
26
+ self.scale_shift_table = None
27
+ if ada_mode == 'ada':
28
+ # move nn.silu outside
29
+ self.time_ada = nn.Linear(dim, 6 * dim, bias=True)
30
+ elif ada_mode == 'ada_single':
31
+ # adaln used in pixel-art alpha
32
+ self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
33
+ elif ada_mode in ['ada_solo', 'ada_sola_bias']:
34
+ self.lora_a = nn.Linear(dim, r * 6, bias=False)
35
+ self.lora_b = nn.Linear(r * 6, dim * 6, bias=False)
36
+ self.scaling = alpha / r
37
+ if ada_mode == 'ada_sola_bias':
38
+ # take bias out for consistency
39
+ self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
40
+ else:
41
+ raise NotImplementedError
42
+
43
+ def forward(self, time_token=None, time_ada=None):
44
+ if self.ada_mode == 'ada':
45
+ assert time_ada is None
46
+ B = time_token.shape[0]
47
+ time_ada = self.time_ada(time_token).reshape(B, 6, -1)
48
+ elif self.ada_mode == 'ada_single':
49
+ B = time_ada.shape[0]
50
+ time_ada = time_ada.reshape(B, 6, -1)
51
+ time_ada = self.scale_shift_table[None] + time_ada
52
+ elif self.ada_mode in ['ada_sola', 'ada_sola_bias']:
53
+ B = time_ada.shape[0]
54
+ time_ada_lora = self.lora_b(self.lora_a(time_token)) * self.scaling
55
+ time_ada = time_ada + time_ada_lora
56
+ time_ada = time_ada.reshape(B, 6, -1)
57
+ if self.scale_shift_table is not None:
58
+ time_ada = self.scale_shift_table[None] + time_ada
59
+ else:
60
+ raise NotImplementedError
61
+ return time_ada
62
+
63
+
64
+ class DiTBlock(nn.Module):
65
+ """
66
+ A modified PixArt block with adaptive layer norm (adaLN-single) conditioning.
67
+ """
68
+ def __init__(
69
+ self,
70
+ dim,
71
+ context_dim=None,
72
+ num_heads=8,
73
+ mlp_ratio=4.,
74
+ qkv_bias=False,
75
+ qk_scale=None,
76
+ qk_norm=None,
77
+ act_layer='gelu',
78
+ norm_layer=nn.LayerNorm,
79
+ time_fusion='none',
80
+ ada_sola_rank=None,
81
+ ada_sola_alpha=None,
82
+ skip=False,
83
+ skip_norm=False,
84
+ rope_mode='none',
85
+ context_norm=False,
86
+ use_checkpoint=False
87
+ ):
88
+
89
+ super().__init__()
90
+ self.norm1 = norm_layer(dim)
91
+ self.attn = Attention(
92
+ dim=dim,
93
+ num_heads=num_heads,
94
+ qkv_bias=qkv_bias,
95
+ qk_scale=qk_scale,
96
+ qk_norm=qk_norm,
97
+ rope_mode=rope_mode
98
+ )
99
+
100
+ if context_dim is not None:
101
+ self.use_context = True
102
+ self.cross_attn = Attention(
103
+ dim=dim,
104
+ num_heads=num_heads,
105
+ context_dim=context_dim,
106
+ qkv_bias=qkv_bias,
107
+ qk_scale=qk_scale,
108
+ qk_norm=qk_norm,
109
+ rope_mode='none'
110
+ )
111
+ self.norm2 = norm_layer(dim)
112
+ if context_norm:
113
+ self.norm_context = norm_layer(context_dim)
114
+ else:
115
+ self.norm_context = nn.Identity()
116
+ else:
117
+ self.use_context = False
118
+
119
+ self.norm3 = norm_layer(dim)
120
+ self.mlp = FeedForward(
121
+ dim=dim, mult=mlp_ratio, activation_fn=act_layer, dropout=0
122
+ )
123
+
124
+ self.use_adanorm = True if time_fusion != 'token' else False
125
+ if self.use_adanorm:
126
+ self.adaln = AdaLN(
127
+ dim,
128
+ ada_mode=time_fusion,
129
+ r=ada_sola_rank,
130
+ alpha=ada_sola_alpha
131
+ )
132
+ if skip:
133
+ self.skip_norm = norm_layer(2 *
134
+ dim) if skip_norm else nn.Identity()
135
+ self.skip_linear = nn.Linear(2 * dim, dim)
136
+ else:
137
+ self.skip_linear = None
138
+
139
+ self.use_checkpoint = use_checkpoint
140
+
141
+ def forward(
142
+ self,
143
+ x,
144
+ time_token=None,
145
+ time_ada=None,
146
+ skip=None,
147
+ context=None,
148
+ x_mask=None,
149
+ context_mask=None,
150
+ extras=None
151
+ ):
152
+ if self.use_checkpoint:
153
+ return checkpoint(
154
+ self._forward,
155
+ x,
156
+ time_token,
157
+ time_ada,
158
+ skip,
159
+ context,
160
+ x_mask,
161
+ context_mask,
162
+ extras,
163
+ use_reentrant=False
164
+ )
165
+ else:
166
+ return self._forward(
167
+ x, time_token, time_ada, skip, context, x_mask, context_mask,
168
+ extras
169
+ )
170
+
171
+ def _forward(
172
+ self,
173
+ x,
174
+ time_token=None,
175
+ time_ada=None,
176
+ skip=None,
177
+ context=None,
178
+ x_mask=None,
179
+ context_mask=None,
180
+ extras=None
181
+ ):
182
+ B, T, C = x.shape
183
+ if self.skip_linear is not None:
184
+ assert skip is not None
185
+ cat = torch.cat([x, skip], dim=-1)
186
+ cat = self.skip_norm(cat)
187
+ x = self.skip_linear(cat)
188
+
189
+ if self.use_adanorm:
190
+ time_ada = self.adaln(time_token, time_ada)
191
+ (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
192
+ gate_mlp) = time_ada.chunk(6, dim=1)
193
+
194
+ # self attention
195
+ if self.use_adanorm:
196
+ x_norm = film_modulate(
197
+ self.norm1(x), shift=shift_msa, scale=scale_msa
198
+ )
199
+ x = x + (1-gate_msa) * self.attn(
200
+ x_norm, context=None, context_mask=x_mask, extras=extras
201
+ )
202
+ else:
203
+ x = x + self.attn(
204
+ self.norm1(x),
205
+ context=None,
206
+ context_mask=x_mask,
207
+ extras=extras
208
+ )
209
+
210
+ # cross attention
211
+ if self.use_context:
212
+ assert context is not None
213
+ x = x + self.cross_attn(
214
+ x=self.norm2(x),
215
+ context=self.norm_context(context),
216
+ context_mask=context_mask,
217
+ extras=extras
218
+ )
219
+
220
+ # mlp
221
+ if self.use_adanorm:
222
+ x_norm = film_modulate(
223
+ self.norm3(x), shift=shift_mlp, scale=scale_mlp
224
+ )
225
+ x = x + (1-gate_mlp) * self.mlp(x_norm)
226
+ else:
227
+ x = x + self.mlp(self.norm3(x))
228
+
229
+ return x
230
+
231
+
232
+ class FinalBlock(nn.Module):
233
+ def __init__(
234
+ self,
235
+ embed_dim,
236
+ patch_size,
237
+ in_chans,
238
+ img_size,
239
+ input_type='2d',
240
+ norm_layer=nn.LayerNorm,
241
+ use_conv=True,
242
+ use_adanorm=True
243
+ ):
244
+ super().__init__()
245
+ self.in_chans = in_chans
246
+ self.img_size = img_size
247
+ self.input_type = input_type
248
+
249
+ self.norm = norm_layer(embed_dim)
250
+ if use_adanorm:
251
+ self.use_adanorm = True
252
+ else:
253
+ self.use_adanorm = False
254
+
255
+ if input_type == '2d':
256
+ self.patch_dim = patch_size**2 * in_chans
257
+ self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
258
+ if use_conv:
259
+ self.final_layer = nn.Conv2d(
260
+ self.in_chans, self.in_chans, 3, padding=1
261
+ )
262
+ else:
263
+ self.final_layer = nn.Identity()
264
+
265
+ elif input_type == '1d':
266
+ self.patch_dim = patch_size * in_chans
267
+ self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
268
+ if use_conv:
269
+ self.final_layer = nn.Conv1d(
270
+ self.in_chans, self.in_chans, 3, padding=1
271
+ )
272
+ else:
273
+ self.final_layer = nn.Identity()
274
+
275
+ def forward(self, x, time_ada=None, extras=0):
276
+ B, T, C = x.shape
277
+ x = x[:, extras:, :]
278
+ # only handle generation target
279
+ if self.use_adanorm:
280
+ shift, scale = time_ada.reshape(B, 2, -1).chunk(2, dim=1)
281
+ x = film_modulate(self.norm(x), shift, scale)
282
+ else:
283
+ x = self.norm(x)
284
+ x = self.linear(x)
285
+ x = unpatchify(x, self.in_chans, self.input_type, self.img_size)
286
+ x = self.final_layer(x)
287
+ return x
288
+
289
+
290
+ class UDiT(nn.Module):
291
+ def __init__(
292
+ self,
293
+ img_size=224,
294
+ patch_size=16,
295
+ in_chans=3,
296
+ input_type='2d',
297
+ out_chans=None,
298
+ embed_dim=768,
299
+ depth=12,
300
+ num_heads=12,
301
+ mlp_ratio=4.,
302
+ qkv_bias=False,
303
+ qk_scale=None,
304
+ qk_norm=None,
305
+ act_layer='gelu',
306
+ norm_layer='layernorm',
307
+ context_norm=False,
308
+ use_checkpoint=False,
309
+ # time fusion ada or token
310
+ time_fusion='token',
311
+ ada_sola_rank=None,
312
+ ada_sola_alpha=None,
313
+ cls_dim=None,
314
+ # max length is only used for concat
315
+ context_dim=768,
316
+ context_fusion='concat',
317
+ context_max_length=128,
318
+ context_pe_method='sinu',
319
+ pe_method='abs',
320
+ rope_mode='none',
321
+ use_conv=True,
322
+ skip=True,
323
+ skip_norm=True
324
+ ):
325
+ super().__init__()
326
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
327
+
328
+ # input
329
+ self.in_chans = in_chans
330
+ self.input_type = input_type
331
+ if self.input_type == '2d':
332
+ num_patches = (img_size[0] //
333
+ patch_size) * (img_size[1] // patch_size)
334
+ elif self.input_type == '1d':
335
+ num_patches = img_size // patch_size
336
+ self.patch_embed = PatchEmbed(
337
+ patch_size=patch_size,
338
+ in_chans=in_chans,
339
+ embed_dim=embed_dim,
340
+ input_type=input_type
341
+ )
342
+ out_chans = in_chans if out_chans is None else out_chans
343
+ self.out_chans = out_chans
344
+
345
+ # position embedding
346
+ self.rope = rope_mode
347
+ self.x_pe = PE_wrapper(
348
+ dim=embed_dim, method=pe_method, length=num_patches
349
+ )
350
+
351
+ logger.info(f'x position embedding: {pe_method}')
352
+ logger.info(f'rope mode: {self.rope}')
353
+
354
+ # time embed
355
+ self.time_embed = TimestepEmbedder(embed_dim)
356
+ self.time_fusion = time_fusion
357
+ self.use_adanorm = False
358
+
359
+ # cls embed
360
+ if cls_dim is not None:
361
+ self.cls_embed = nn.Sequential(
362
+ nn.Linear(cls_dim, embed_dim, bias=True),
363
+ nn.SiLU(),
364
+ nn.Linear(embed_dim, embed_dim, bias=True),
365
+ )
366
+ else:
367
+ self.cls_embed = None
368
+
369
+ # time fusion
370
+ if time_fusion == 'token':
371
+ # put token at the beginning of sequence
372
+ self.extras = 2 if self.cls_embed else 1
373
+ self.time_pe = PE_wrapper(
374
+ dim=embed_dim, method='abs', length=self.extras
375
+ )
376
+ elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
377
+ self.use_adanorm = True
378
+ # aviod repetitive silu for each adaln block
379
+ self.time_act = nn.SiLU()
380
+ self.extras = 0
381
+ self.time_ada_final = nn.Linear(
382
+ embed_dim, 2 * embed_dim, bias=True
383
+ )
384
+ if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
385
+ # shared adaln
386
+ self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
387
+ else:
388
+ self.time_ada = None
389
+ else:
390
+ raise NotImplementedError
391
+ logger.info(f'time fusion mode: {self.time_fusion}')
392
+
393
+ # context
394
+ # use a simple projection
395
+ self.use_context = False
396
+ self.context_cross = False
397
+ self.context_max_length = context_max_length
398
+ self.context_fusion = 'none'
399
+ if context_dim is not None:
400
+ self.use_context = True
401
+ self.context_embed = nn.Sequential(
402
+ nn.Linear(context_dim, embed_dim, bias=True),
403
+ nn.SiLU(),
404
+ nn.Linear(embed_dim, embed_dim, bias=True),
405
+ )
406
+ self.context_fusion = context_fusion
407
+ if context_fusion == 'concat' or context_fusion == 'joint':
408
+ self.extras += context_max_length
409
+ self.context_pe = PE_wrapper(
410
+ dim=embed_dim,
411
+ method=context_pe_method,
412
+ length=context_max_length
413
+ )
414
+ # no cross attention layers
415
+ context_dim = None
416
+ elif context_fusion == 'cross':
417
+ self.context_pe = PE_wrapper(
418
+ dim=embed_dim,
419
+ method=context_pe_method,
420
+ length=context_max_length
421
+ )
422
+ self.context_cross = True
423
+ context_dim = embed_dim
424
+ else:
425
+ raise NotImplementedError
426
+ logger.info(f'context fusion mode: {context_fusion}')
427
+ logger.info(f'context position embedding: {context_pe_method}')
428
+
429
+ self.use_skip = skip
430
+
431
+ # norm layers
432
+ if norm_layer == 'layernorm':
433
+ norm_layer = nn.LayerNorm
434
+ elif norm_layer == 'rmsnorm':
435
+ norm_layer = RMSNorm
436
+ else:
437
+ raise NotImplementedError
438
+
439
+ logger.info(f'use long skip connection: {skip}')
440
+ self.in_blocks = nn.ModuleList([
441
+ DiTBlock(
442
+ dim=embed_dim,
443
+ context_dim=context_dim,
444
+ num_heads=num_heads,
445
+ mlp_ratio=mlp_ratio,
446
+ qkv_bias=qkv_bias,
447
+ qk_scale=qk_scale,
448
+ qk_norm=qk_norm,
449
+ act_layer=act_layer,
450
+ norm_layer=norm_layer,
451
+ time_fusion=time_fusion,
452
+ ada_sola_rank=ada_sola_rank,
453
+ ada_sola_alpha=ada_sola_alpha,
454
+ skip=False,
455
+ skip_norm=False,
456
+ rope_mode=self.rope,
457
+ context_norm=context_norm,
458
+ use_checkpoint=use_checkpoint
459
+ ) for _ in range(depth // 2)
460
+ ])
461
+
462
+ self.mid_block = DiTBlock(
463
+ dim=embed_dim,
464
+ context_dim=context_dim,
465
+ num_heads=num_heads,
466
+ mlp_ratio=mlp_ratio,
467
+ qkv_bias=qkv_bias,
468
+ qk_scale=qk_scale,
469
+ qk_norm=qk_norm,
470
+ act_layer=act_layer,
471
+ norm_layer=norm_layer,
472
+ time_fusion=time_fusion,
473
+ ada_sola_rank=ada_sola_rank,
474
+ ada_sola_alpha=ada_sola_alpha,
475
+ skip=False,
476
+ skip_norm=False,
477
+ rope_mode=self.rope,
478
+ context_norm=context_norm,
479
+ use_checkpoint=use_checkpoint
480
+ )
481
+
482
+ self.out_blocks = nn.ModuleList([
483
+ DiTBlock(
484
+ dim=embed_dim,
485
+ context_dim=context_dim,
486
+ num_heads=num_heads,
487
+ mlp_ratio=mlp_ratio,
488
+ qkv_bias=qkv_bias,
489
+ qk_scale=qk_scale,
490
+ qk_norm=qk_norm,
491
+ act_layer=act_layer,
492
+ norm_layer=norm_layer,
493
+ time_fusion=time_fusion,
494
+ ada_sola_rank=ada_sola_rank,
495
+ ada_sola_alpha=ada_sola_alpha,
496
+ skip=skip,
497
+ skip_norm=skip_norm,
498
+ rope_mode=self.rope,
499
+ context_norm=context_norm,
500
+ use_checkpoint=use_checkpoint
501
+ ) for _ in range(depth // 2)
502
+ ])
503
+
504
+ # FinalLayer block
505
+ self.use_conv = use_conv
506
+ self.final_block = FinalBlock(
507
+ embed_dim=embed_dim,
508
+ patch_size=patch_size,
509
+ img_size=img_size,
510
+ in_chans=out_chans,
511
+ input_type=input_type,
512
+ norm_layer=norm_layer,
513
+ use_conv=use_conv,
514
+ use_adanorm=self.use_adanorm
515
+ )
516
+ self.initialize_weights()
517
+
518
+ def _init_ada(self):
519
+ if self.time_fusion == 'ada':
520
+ nn.init.constant_(self.time_ada_final.weight, 0)
521
+ nn.init.constant_(self.time_ada_final.bias, 0)
522
+ for block in self.in_blocks:
523
+ nn.init.constant_(block.adaln.time_ada.weight, 0)
524
+ nn.init.constant_(block.adaln.time_ada.bias, 0)
525
+ nn.init.constant_(self.mid_block.adaln.time_ada.weight, 0)
526
+ nn.init.constant_(self.mid_block.adaln.time_ada.bias, 0)
527
+ for block in self.out_blocks:
528
+ nn.init.constant_(block.adaln.time_ada.weight, 0)
529
+ nn.init.constant_(block.adaln.time_ada.bias, 0)
530
+ elif self.time_fusion == 'ada_single':
531
+ nn.init.constant_(self.time_ada.weight, 0)
532
+ nn.init.constant_(self.time_ada.bias, 0)
533
+ nn.init.constant_(self.time_ada_final.weight, 0)
534
+ nn.init.constant_(self.time_ada_final.bias, 0)
535
+ elif self.time_fusion in ['ada_sola', 'ada_sola_bias']:
536
+ nn.init.constant_(self.time_ada.weight, 0)
537
+ nn.init.constant_(self.time_ada.bias, 0)
538
+ nn.init.constant_(self.time_ada_final.weight, 0)
539
+ nn.init.constant_(self.time_ada_final.bias, 0)
540
+ for block in self.in_blocks:
541
+ nn.init.kaiming_uniform_(
542
+ block.adaln.lora_a.weight, a=math.sqrt(5)
543
+ )
544
+ nn.init.constant_(block.adaln.lora_b.weight, 0)
545
+ nn.init.kaiming_uniform_(
546
+ self.mid_block.adaln.lora_a.weight, a=math.sqrt(5)
547
+ )
548
+ nn.init.constant_(self.mid_block.adaln.lora_b.weight, 0)
549
+ for block in self.out_blocks:
550
+ nn.init.kaiming_uniform_(
551
+ block.adaln.lora_a.weight, a=math.sqrt(5)
552
+ )
553
+ nn.init.constant_(block.adaln.lora_b.weight, 0)
554
+
555
+ def initialize_weights(self):
556
+ # Basic init for all layers
557
+ def _basic_init(module):
558
+ if isinstance(module, nn.Linear):
559
+ torch.nn.init.xavier_uniform_(module.weight)
560
+ if module.bias is not None:
561
+ nn.init.constant_(module.bias, 0)
562
+
563
+ self.apply(_basic_init)
564
+
565
+ # init patch Conv like Linear
566
+ w = self.patch_embed.proj.weight.data
567
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
568
+ nn.init.constant_(self.patch_embed.proj.bias, 0)
569
+
570
+ # Zero-out AdaLN
571
+ if self.use_adanorm:
572
+ self._init_ada()
573
+
574
+ # Zero-out Cross Attention
575
+ if self.context_cross:
576
+ for block in self.in_blocks:
577
+ nn.init.constant_(block.cross_attn.proj.weight, 0)
578
+ nn.init.constant_(block.cross_attn.proj.bias, 0)
579
+ nn.init.constant_(self.mid_block.cross_attn.proj.weight, 0)
580
+ nn.init.constant_(self.mid_block.cross_attn.proj.bias, 0)
581
+ for block in self.out_blocks:
582
+ nn.init.constant_(block.cross_attn.proj.weight, 0)
583
+ nn.init.constant_(block.cross_attn.proj.bias, 0)
584
+
585
+ # Zero-out cls embedding
586
+ if self.cls_embed:
587
+ if self.use_adanorm:
588
+ nn.init.constant_(self.cls_embed[-1].weight, 0)
589
+ nn.init.constant_(self.cls_embed[-1].bias, 0)
590
+
591
+ # Zero-out Output
592
+ # might not zero-out this when using v-prediction
593
+ # it could be good when using noise-prediction
594
+ # nn.init.constant_(self.final_block.linear.weight, 0)
595
+ # nn.init.constant_(self.final_block.linear.bias, 0)
596
+ # if self.use_conv:
597
+ # nn.init.constant_(self.final_block.final_layer.weight.data, 0)
598
+ # nn.init.constant_(self.final_block.final_layer.bias, 0)
599
+
600
+ # init out Conv
601
+ if self.use_conv:
602
+ nn.init.xavier_uniform_(self.final_block.final_layer.weight)
603
+ nn.init.constant_(self.final_block.final_layer.bias, 0)
604
+
605
+ def _concat_x_context(self, x, context, x_mask=None, context_mask=None):
606
+ assert context.shape[-2] == self.context_max_length
607
+ # Check if either x_mask or context_mask is provided
608
+ B = x.shape[0]
609
+ # Create default masks if they are not provided
610
+ if x_mask is None:
611
+ x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
612
+ if context_mask is None:
613
+ context_mask = torch.ones(
614
+ B, context.shape[-2], device=context.device
615
+ ).bool()
616
+ # Concatenate the masks along the second dimension (dim=1)
617
+ x_mask = torch.cat([context_mask, x_mask], dim=1)
618
+ # Concatenate context and x along the second dimension (dim=1)
619
+ x = torch.cat((context, x), dim=1)
620
+ return x, x_mask
621
+
622
+ def forward(
623
+ self,
624
+ x,
625
+ timesteps,
626
+ context,
627
+ x_mask=None,
628
+ context_mask=None,
629
+ cls_token=None,
630
+ controlnet_skips=None,
631
+ ):
632
+ # make it compatible with int time step during inference
633
+ if timesteps.dim() == 0:
634
+ timesteps = timesteps.expand(x.shape[0]
635
+ ).to(x.device, dtype=torch.long)
636
+
637
+ x = self.patch_embed(x)
638
+ x = self.x_pe(x)
639
+
640
+ B, L, D = x.shape
641
+
642
+ if self.use_context:
643
+ context_token = self.context_embed(context)
644
+ context_token = self.context_pe(context_token)
645
+ if self.context_fusion == 'concat' or self.context_fusion == 'joint':
646
+ x, x_mask = self._concat_x_context(
647
+ x=x,
648
+ context=context_token,
649
+ x_mask=x_mask,
650
+ context_mask=context_mask
651
+ )
652
+ context_token, context_mask = None, None
653
+ else:
654
+ context_token, context_mask = None, None
655
+
656
+ time_token = self.time_embed(timesteps)
657
+ if self.cls_embed:
658
+ cls_token = self.cls_embed(cls_token)
659
+ time_ada = None
660
+ time_ada_final = None
661
+ if self.use_adanorm:
662
+ if self.cls_embed:
663
+ time_token = time_token + cls_token
664
+ time_token = self.time_act(time_token)
665
+ time_ada_final = self.time_ada_final(time_token)
666
+ if self.time_ada is not None:
667
+ time_ada = self.time_ada(time_token)
668
+ else:
669
+ time_token = time_token.unsqueeze(dim=1)
670
+ if self.cls_embed:
671
+ cls_token = cls_token.unsqueeze(dim=1)
672
+ time_token = torch.cat([time_token, cls_token], dim=1)
673
+ time_token = self.time_pe(time_token)
674
+ x = torch.cat((time_token, x), dim=1)
675
+ if x_mask is not None:
676
+ x_mask = torch.cat([
677
+ torch.ones(B, time_token.shape[1],
678
+ device=x_mask.device).bool(), x_mask
679
+ ],
680
+ dim=1)
681
+ time_token = None
682
+
683
+ skips = []
684
+ for blk in self.in_blocks:
685
+ x = blk(
686
+ x=x,
687
+ time_token=time_token,
688
+ time_ada=time_ada,
689
+ skip=None,
690
+ context=context_token,
691
+ x_mask=x_mask,
692
+ context_mask=context_mask,
693
+ extras=self.extras
694
+ )
695
+ if self.use_skip:
696
+ skips.append(x)
697
+
698
+ x = self.mid_block(
699
+ x=x,
700
+ time_token=time_token,
701
+ time_ada=time_ada,
702
+ skip=None,
703
+ context=context_token,
704
+ x_mask=x_mask,
705
+ context_mask=context_mask,
706
+ extras=self.extras
707
+ )
708
+ for blk in self.out_blocks:
709
+ if self.use_skip:
710
+ skip = skips.pop()
711
+ if controlnet_skips:
712
+ # add to skip like u-net controlnet
713
+ skip = skip + controlnet_skips.pop()
714
+ else:
715
+ skip = None
716
+ if controlnet_skips:
717
+ # directly add to x
718
+ x = x + controlnet_skips.pop()
719
+
720
+ x = blk(
721
+ x=x,
722
+ time_token=time_token,
723
+ time_ada=time_ada,
724
+ skip=skip,
725
+ context=context_token,
726
+ x_mask=x_mask,
727
+ context_mask=context_mask,
728
+ extras=self.extras
729
+ )
730
+
731
+ x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
732
+
733
+ return x
734
+
735
+
736
+ class MaskDiT(nn.Module):
737
+ def __init__(
738
+ self,
739
+ model: UDiT,
740
+ mae=False,
741
+ mae_prob=0.5,
742
+ mask_ratio=[0.25, 1.0],
743
+ mask_span=10,
744
+ ):
745
+ super().__init__()
746
+ self.model = model
747
+ self.mae = mae
748
+ if self.mae:
749
+ out_channel = model.out_chans
750
+ self.mask_embed = nn.Parameter(torch.zeros((out_channel)))
751
+ self.mae_prob = mae_prob
752
+ self.mask_ratio = mask_ratio
753
+ self.mask_span = mask_span
754
+
755
+ def random_masking(self, gt, mask_ratios, mae_mask_infer=None):
756
+ B, D, L = gt.shape
757
+ if mae_mask_infer is None:
758
+ # mask = torch.rand(B, L).to(gt.device) < mask_ratios.unsqueeze(1)
759
+ mask_ratios = mask_ratios.cpu().numpy()
760
+ mask = compute_mask_indices(
761
+ shape=[B, L],
762
+ padding_mask=None,
763
+ mask_prob=mask_ratios,
764
+ mask_length=self.mask_span,
765
+ mask_type="static",
766
+ mask_other=0.0,
767
+ min_masks=1,
768
+ no_overlap=False,
769
+ min_space=0,
770
+ )
771
+ mask = mask.to(gt.device).unsqueeze(1).expand_as(gt)
772
+ else:
773
+ mask = mae_mask_infer
774
+ mask = mask.expand_as(gt)
775
+ gt[mask] = self.mask_embed.view(1, D, 1).expand_as(gt)[mask]
776
+ return gt, mask.type_as(gt)
777
+
778
+ def forward(
779
+ self,
780
+ x,
781
+ timesteps,
782
+ context,
783
+ x_mask=None,
784
+ context_mask=None,
785
+ cls_token=None,
786
+ gt=None,
787
+ mae_mask_infer=None,
788
+ forward_model=True
789
+ ):
790
+ # todo: handle controlnet inside
791
+ mae_mask = torch.ones_like(x)
792
+ if self.mae:
793
+ if gt is not None:
794
+ B, D, L = gt.shape
795
+ mask_ratios = torch.FloatTensor(B).uniform_(*self.mask_ratio
796
+ ).to(gt.device)
797
+ gt, mae_mask = self.random_masking(
798
+ gt, mask_ratios, mae_mask_infer
799
+ )
800
+ # apply mae only to the selected batches
801
+ if mae_mask_infer is None:
802
+ # determine mae batch
803
+ mae_batch = torch.rand(B) < self.mae_prob
804
+ gt[~mae_batch] = self.mask_embed.view(
805
+ 1, D, 1
806
+ ).expand_as(gt)[~mae_batch]
807
+ mae_mask[~mae_batch] = 1.0
808
+ else:
809
+ B, D, L = x.shape
810
+ gt = self.mask_embed.view(1, D, 1).expand_as(x)
811
+ x = torch.cat([x, gt, mae_mask[:, 0:1, :]], dim=1)
812
+
813
+ if forward_model:
814
+ x = self.model(
815
+ x=x,
816
+ timesteps=timesteps,
817
+ context=context,
818
+ x_mask=x_mask,
819
+ context_mask=context_mask,
820
+ cls_token=cls_token
821
+ )
822
+ # logger.info(mae_mask[:, 0, :].sum(dim=-1))
823
+ return x, mae_mask
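During MAE training, MaskDiT replaces random spans of the ground-truth latent with the learned mask embedding before the latent is concatenated to the noisy input. A minimal standalone sketch of that masking step, assuming dummy shapes (B=2, D=8, L=100) and the import path implied by this upload's file layout:

import torch
from models.dit.span_mask import compute_mask_indices

B, D, L = 2, 8, 100
gt = torch.randn(B, D, L)        # stand-in for the ground-truth latent
mask_embed = torch.zeros(D)      # stand-in for the learned mask embedding

# per-example mask ratio drawn from [0.25, 1.0], as in MaskDiT.forward
mask_ratios = torch.FloatTensor(B).uniform_(0.25, 1.0)

# span mask of shape [B, L]; mask_length plays the role of mask_span
mask = compute_mask_indices(
    shape=[B, L],
    padding_mask=None,
    mask_prob=mask_ratios.numpy(),
    mask_length=10,
    mask_type="static",
    min_masks=1,
)
mask = mask.unsqueeze(1).expand(B, D, L)    # broadcast over channels

# masked positions take the value of the mask embedding
gt_masked = torch.where(mask, mask_embed.view(1, D, 1).expand(B, D, L), gt)
print(gt_masked.shape, mask.float().mean().item())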
models/dit/modules.py ADDED
@@ -0,0 +1,445 @@
1
+ import warnings
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+ from torch.cuda.amp import autocast
7
+ import math
8
+ import einops
9
+ from einops import rearrange, repeat
10
+ from inspect import isfunction
11
+
12
+
13
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
14
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
15
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
16
+ def norm_cdf(x):
17
+ # Computes standard normal cumulative distribution function
18
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
19
+
20
+ if (mean < a - 2*std) or (mean > b + 2*std):
21
+ warnings.warn(
22
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
23
+ "The distribution of values may be incorrect.",
24
+ stacklevel=2
25
+ )
26
+
27
+ with torch.no_grad():
28
+ # Values are generated by using a truncated uniform distribution and
29
+ # then using the inverse CDF for the normal distribution.
30
+ # Get upper and lower cdf values
31
+ l = norm_cdf((a-mean) / std)
32
+ u = norm_cdf((b-mean) / std)
33
+
34
+ # Uniformly fill tensor with values from [l, u], then translate to
35
+ # [2l-1, 2u-1].
36
+ tensor.uniform_(2*l - 1, 2*u - 1)
37
+
38
+ # Use inverse cdf transform for normal distribution to get truncated
39
+ # standard normal
40
+ tensor.erfinv_()
41
+
42
+ # Transform to proper mean, std
43
+ tensor.mul_(std * math.sqrt(2.))
44
+ tensor.add_(mean)
45
+
46
+ # Clamp to ensure it's in the proper range
47
+ tensor.clamp_(min=a, max=b)
48
+ return tensor
49
+
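# Usage sketch (illustrative): trunc_normal_ fills a tensor in place with
# values from N(mean, std^2) truncated to [a, b], e.g.
#   w = torch.empty(256, 768)
#   trunc_normal_(w, mean=0., std=0.02, a=-0.04, b=0.04)
# after which all entries of w lie in [-0.04, 0.04].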
50
+
51
+ # disable in checkpoint mode
52
+ # @torch.jit.script
53
+ def film_modulate(x, shift, scale):
54
+ return x * (1+scale) + shift
55
+
56
+
57
+ def timestep_embedding(timesteps, dim, max_period=10000):
58
+ """
59
+ Create sinusoidal timestep embeddings.
60
+
61
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
62
+ These may be fractional.
63
+ :param dim: the dimension of the output.
64
+ :param max_period: controls the minimum frequency of the embeddings.
65
+ :return: an [N x dim] Tensor of positional embeddings.
66
+ """
67
+ half = dim // 2
68
+ freqs = torch.exp(
69
+ -math.log(max_period) *
70
+ torch.arange(start=0, end=half, dtype=torch.float32) / half
71
+ ).to(device=timesteps.device)
72
+ args = timesteps[:, None].float() * freqs[None]
73
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
74
+ if dim % 2:
75
+ embedding = torch.cat([embedding,
76
+ torch.zeros_like(embedding[:, :1])],
77
+ dim=-1)
78
+ return embedding
79
+
80
+
81
+ class TimestepEmbedder(nn.Module):
82
+ """
83
+ Embeds scalar timesteps into vector representations.
84
+ """
85
+ def __init__(
86
+ self, hidden_size, frequency_embedding_size=256, out_size=None
87
+ ):
88
+ super().__init__()
89
+ if out_size is None:
90
+ out_size = hidden_size
91
+ self.mlp = nn.Sequential(
92
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
93
+ nn.SiLU(),
94
+ nn.Linear(hidden_size, out_size, bias=True),
95
+ )
96
+ self.frequency_embedding_size = frequency_embedding_size
97
+
98
+ def forward(self, t):
99
+ t_freq = timestep_embedding(t, self.frequency_embedding_size).type(
100
+ self.mlp[0].weight.dtype
101
+ )
102
+ t_emb = self.mlp(t_freq)
103
+ return t_emb
104
+
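# Shape note (illustrative): for t = torch.tensor([0, 250, 999]),
# timestep_embedding(t, dim=256) returns a [3, 256] tensor of sinusoidal
# features, and TimestepEmbedder(hidden_size=768)(t) projects it to [3, 768].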
105
+
106
+ def patchify(imgs, patch_size, input_type='2d'):
107
+ if input_type == '2d':
108
+ x = einops.rearrange(
109
+ imgs,
110
+ 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)',
111
+ p1=patch_size,
112
+ p2=patch_size
113
+ )
114
+ elif input_type == '1d':
115
+ x = einops.rearrange(imgs, 'B C (h p1) -> B h (p1 C)', p1=patch_size)
116
+ return x
117
+
118
+
119
+ def unpatchify(x, channels=3, input_type='2d', img_size=None):
120
+ if input_type == '2d':
121
+ patch_size = int((x.shape[2] // channels)**0.5)
122
+ # h = w = int(x.shape[1] ** .5)
123
+ h, w = img_size[0] // patch_size, img_size[1] // patch_size
124
+ assert h * w == x.shape[1] and patch_size**2 * channels == x.shape[2]
125
+ x = einops.rearrange(
126
+ x,
127
+ 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)',
128
+ h=h,
129
+ p1=patch_size,
130
+ p2=patch_size
131
+ )
132
+ elif input_type == '1d':
133
+ patch_size = int((x.shape[2] // channels))
134
+ h = x.shape[1]
135
+ assert patch_size * channels == x.shape[2]
136
+ x = einops.rearrange(x, 'B h (p1 C) -> B C (h p1)', h=h, p1=patch_size)
137
+ return x
138
+
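# Shape note (illustrative): in the '1d' case, patchify maps a latent of
# shape [B, C, T] to [B, T // patch_size, patch_size * C] (T must be
# divisible by patch_size), and unpatchify(..., channels=C, input_type='1d')
# inverts it exactly.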
139
+
140
+ class PatchEmbed(nn.Module):
141
+ """
142
+ Image to Patch Embedding
143
+ """
144
+ def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type='2d'):
145
+ super().__init__()
146
+ self.patch_size = patch_size
147
+ self.input_type = input_type
148
+ if input_type == '2d':
149
+ self.proj = nn.Conv2d(
150
+ in_chans,
151
+ embed_dim,
152
+ kernel_size=patch_size,
153
+ stride=patch_size,
154
+ bias=True
155
+ )
156
+ elif input_type == '1d':
157
+ self.proj = nn.Conv1d(
158
+ in_chans,
159
+ embed_dim,
160
+ kernel_size=patch_size,
161
+ stride=patch_size,
162
+ bias=True
163
+ )
164
+
165
+ def forward(self, x):
166
+ if self.input_type == '2d':
167
+ B, C, H, W = x.shape
168
+ assert H % self.patch_size == 0 and W % self.patch_size == 0
169
+ elif self.input_type == '1d':
170
+ B, C, H = x.shape
171
+ assert H % self.patch_size == 0
172
+
173
+ x = self.proj(x).flatten(2).transpose(1, 2)
174
+ return x
175
+
176
+
177
+ class PositionalConvEmbedding(nn.Module):
178
+ """
179
+ Relative positional embedding used in HuBERT
180
+ """
181
+ def __init__(self, dim=768, kernel_size=128, groups=16):
182
+ super().__init__()
183
+ self.conv = nn.Conv1d(
184
+ dim,
185
+ dim,
186
+ kernel_size=kernel_size,
187
+ padding=kernel_size // 2,
188
+ groups=groups,
189
+ bias=True
190
+ )
191
+ self.conv = nn.utils.parametrizations.weight_norm(
192
+ self.conv, name="weight", dim=2
193
+ )
194
+
195
+ def forward(self, x):
196
+ # B C T
197
+ x = self.conv(x)
198
+ x = F.gelu(x[:, :, :-1])
199
+ return x
200
+
201
+
202
+ class SinusoidalPositionalEncoding(nn.Module):
203
+ def __init__(self, dim, length):
204
+ super(SinusoidalPositionalEncoding, self).__init__()
205
+ self.length = length
206
+ self.dim = dim
207
+ self.register_buffer(
208
+ 'pe', self._generate_positional_encoding(length, dim)
209
+ )
210
+
211
+ def _generate_positional_encoding(self, length, dim):
212
+ pe = torch.zeros(length, dim)
213
+ position = torch.arange(0, length, dtype=torch.float).unsqueeze(1)
214
+ div_term = torch.exp(
215
+ torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim)
216
+ )
217
+
218
+ pe[:, 0::2] = torch.sin(position * div_term)
219
+ pe[:, 1::2] = torch.cos(position * div_term)
220
+
221
+ pe = pe.unsqueeze(0)
222
+ return pe
223
+
224
+ def forward(self, x):
225
+ x = x + self.pe[:, :x.size(1)]
226
+ return x
227
+
228
+
229
+ class PE_wrapper(nn.Module):
230
+ def __init__(self, dim=768, method='abs', length=None, **kwargs):
231
+ super().__init__()
232
+ self.method = method
233
+ if method == 'abs':
234
+ # init absolute pe like UViT
235
+ self.length = length
236
+ self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
237
+ trunc_normal_(self.abs_pe, std=.02)
238
+ elif method == 'conv':
239
+ self.conv_pe = PositionalConvEmbedding(dim=dim, **kwargs)
240
+ elif method == 'sinu':
241
+ self.sinu_pe = SinusoidalPositionalEncoding(dim=dim, length=length)
242
+ elif method == 'none':
243
+ # skip pe
244
+ self.id = nn.Identity()
245
+ else:
246
+ raise NotImplementedError
247
+
248
+ def forward(self, x):
249
+ if self.method == 'abs':
250
+ _, L, _ = x.shape
251
+ assert L <= self.length
252
+ x = x + self.abs_pe[:, :L, :]
253
+ elif self.method == 'conv':
254
+ x = x + self.conv_pe(x)
255
+ elif self.method == 'sinu':
256
+ x = self.sinu_pe(x)
257
+ elif self.method == 'none':
258
+ x = self.id(x)
259
+ else:
260
+ raise NotImplementedError
261
+ return x
262
+
263
+
264
+ class RMSNorm(torch.nn.Module):
265
+ def __init__(self, dim: int, eps: float = 1e-6):
266
+ """
267
+ Initialize the RMSNorm normalization layer.
268
+
269
+ Args:
270
+ dim (int): The dimension of the input tensor.
271
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
272
+
273
+ Attributes:
274
+ eps (float): A small value added to the denominator for numerical stability.
275
+ weight (nn.Parameter): Learnable scaling parameter.
276
+
277
+ """
278
+ super().__init__()
279
+ self.eps = eps
280
+ self.weight = nn.Parameter(torch.ones(dim))
281
+
282
+ def _norm(self, x):
283
+ """
284
+ Apply the RMSNorm normalization to the input tensor.
285
+
286
+ Args:
287
+ x (torch.Tensor): The input tensor.
288
+
289
+ Returns:
290
+ torch.Tensor: The normalized tensor.
291
+
292
+ """
293
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
294
+
295
+ def forward(self, x):
296
+ """
297
+ Forward pass through the RMSNorm layer.
298
+
299
+ Args:
300
+ x (torch.Tensor): The input tensor.
301
+
302
+ Returns:
303
+ torch.Tensor: The output tensor after applying RMSNorm.
304
+
305
+ """
306
+ output = self._norm(x.float()).type_as(x)
307
+ return output * self.weight
308
+
309
+
310
+ class GELU(nn.Module):
311
+ def __init__(
312
+ self,
313
+ dim_in: int,
314
+ dim_out: int,
315
+ approximate: str = "none",
316
+ bias: bool = True
317
+ ):
318
+ super().__init__()
319
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
320
+ self.approximate = approximate
321
+
322
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
323
+ if gate.device.type != "mps":
324
+ return F.gelu(gate, approximate=self.approximate)
325
+ # mps: gelu is not implemented for float16
326
+ return F.gelu(
327
+ gate.to(dtype=torch.float32), approximate=self.approximate
328
+ ).to(dtype=gate.dtype)
329
+
330
+ def forward(self, hidden_states):
331
+ hidden_states = self.proj(hidden_states)
332
+ hidden_states = self.gelu(hidden_states)
333
+ return hidden_states
334
+
335
+
336
+ class GEGLU(nn.Module):
337
+ def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
338
+ super().__init__()
339
+ self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
340
+
341
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
342
+ if gate.device.type != "mps":
343
+ return F.gelu(gate)
344
+ # mps: gelu is not implemented for float16
345
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
346
+
347
+ def forward(self, hidden_states):
348
+ hidden_states = self.proj(hidden_states)
349
+ hidden_states, gate = hidden_states.chunk(2, dim=-1)
350
+ return hidden_states * self.gelu(gate)
351
+
352
+
353
+ class ApproximateGELU(nn.Module):
354
+ def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
355
+ super().__init__()
356
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
357
+
358
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
359
+ x = self.proj(x)
360
+ return x * torch.sigmoid(1.702 * x)
361
+
362
+
363
+ # disable in checkpoint mode
364
+ # @torch.jit.script
365
+ def snake_beta(x, alpha, beta):
366
+ return x + beta * torch.sin(x * alpha).pow(2)
367
+
368
+
369
+ class Snake(nn.Module):
370
+ def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
371
+ super().__init__()
372
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
373
+ self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
374
+ self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
375
+ self.alpha.requires_grad = alpha_trainable
376
+ self.beta.requires_grad = alpha_trainable
377
+
378
+ def forward(self, x):
379
+ x = self.proj(x)
380
+ x = snake_beta(x, self.alpha, self.beta)
381
+ return x
382
+
383
+
384
+ class GESnake(nn.Module):
385
+ def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
386
+ super().__init__()
387
+ self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
388
+ self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
389
+ self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
390
+ self.alpha.requires_grad = alpha_trainable
391
+ self.beta.requires_grad = alpha_trainable
392
+
393
+ def forward(self, x):
394
+ x = self.proj(x)
395
+ x, gate = x.chunk(2, dim=-1)
396
+ return x * snake_beta(gate, self.alpha, self.beta)
397
+
398
+
399
+ class FeedForward(nn.Module):
400
+ def __init__(
401
+ self,
402
+ dim,
403
+ dim_out=None,
404
+ mult=4,
405
+ dropout=0.0,
406
+ activation_fn="geglu",
407
+ final_dropout=False,
408
+ inner_dim=None,
409
+ bias=True,
410
+ ):
411
+ super().__init__()
412
+ if inner_dim is None:
413
+ inner_dim = int(dim * mult)
414
+ dim_out = dim_out if dim_out is not None else dim
415
+
416
+ if activation_fn == "gelu":
417
+ act_fn = GELU(dim, inner_dim, bias=bias)
418
+ elif activation_fn == "gelu-approximate":
419
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
420
+ elif activation_fn == "geglu":
421
+ act_fn = GEGLU(dim, inner_dim, bias=bias)
422
+ elif activation_fn == "geglu-approximate":
423
+ act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
424
+ elif activation_fn == "snake":
425
+ act_fn = Snake(dim, inner_dim, bias=bias)
426
+ elif activation_fn == "gesnake":
427
+ act_fn = GESnake(dim, inner_dim, bias=bias)
428
+ else:
429
+ raise NotImplementedError
430
+
431
+ self.net = nn.ModuleList([])
432
+ # project in
433
+ self.net.append(act_fn)
434
+ # project dropout
435
+ self.net.append(nn.Dropout(dropout))
436
+ # project out
437
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
438
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
439
+ if final_dropout:
440
+ self.net.append(nn.Dropout(dropout))
441
+
442
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
443
+ for module in self.net:
444
+ hidden_states = module(hidden_states)
445
+ return hidden_states
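A small usage sketch for the building blocks above (dummy shapes are illustrative; the import path follows this upload's file layout):

import torch
from models.dit.modules import PatchEmbed, FeedForward, TimestepEmbedder

latent = torch.randn(2, 8, 256)      # [B, C, T] audio latent
patches = PatchEmbed(
    patch_size=1, in_chans=8, embed_dim=768, input_type='1d'
)(latent)
print(patches.shape)                 # torch.Size([2, 256, 768])

ff = FeedForward(dim=768, mult=4, activation_fn="geglu")
print(ff(patches).shape)             # torch.Size([2, 256, 768])

t_emb = TimestepEmbedder(hidden_size=768)(torch.tensor([10, 500]))
print(t_emb.shape)                   # torch.Size([2, 768])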
models/dit/rotary.py ADDED
@@ -0,0 +1,88 @@
1
+ import torch
2
+ # This RoPE implementation is faster than the LLaMA RoPE with jit script.
3
+
4
+
5
+ def rotate_half(x):
6
+ x1, x2 = x.chunk(2, dim=-1)
7
+ return torch.cat((-x2, x1), dim=-1)
8
+
9
+
10
+ # disable in checkpoint mode
11
+ # @torch.jit.script
12
+ def apply_rotary_pos_emb(x, cos, sin):
13
+ # NOTE: This could probably be moved to Triton
14
+ # Handle a possible sequence length mismatch in between q and k
15
+ cos = cos[:, :, :x.shape[-2], :]
16
+ sin = sin[:, :, :x.shape[-2], :]
17
+ return (x*cos) + (rotate_half(x) * sin)
18
+
19
+
20
+ class RotaryEmbedding(torch.nn.Module):
21
+ """
22
+ The rotary position embeddings from RoFormer_ (Su et al.).
23
+ A crucial insight from the method is that the query and keys are
24
+ transformed by rotation matrices which depend on the relative positions.
25
+
26
+ Other implementations are available in the Rotary Transformer repo_ and in
27
+ GPT-NeoX_; GPT-NeoX was an inspiration for this one.
28
+
29
+ .. _RoFormer: https://arxiv.org/abs/2104.09864
30
+ .. _repo: https://github.com/ZhuiyiTechnology/roformer
31
+ .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
32
+
33
+
34
+ .. warning: Please note that this embedding is not registered on purpose, as it is transformative
35
+ (it does not create the embedding dimension) and will likely be picked up (imported) on an ad-hoc basis
36
+ """
37
+ def __init__(self, dim: int):
38
+ super().__init__()
39
+ # Generate and save the inverse frequency buffer (non trainable)
40
+ inv_freq = 1.0 / (10000**(torch.arange(0, dim, 2).float() / dim))
41
+ self.register_buffer("inv_freq", inv_freq)
42
+ self._seq_len_cached = None
43
+ self._cos_cached = None
44
+ self._sin_cached = None
45
+
46
+ def _update_cos_sin_tables(self, x, seq_dimension=-2):
47
+ # expect input: B, H, L, D
48
+ seq_len = x.shape[seq_dimension]
49
+
50
+ # Reset the tables if the sequence length has changed,
51
+ # or if we're on a new device (possibly due to tracing for instance)
52
+ # also make sure dtype wont change
53
+ if (
54
+ seq_len != self._seq_len_cached or
55
+ self._cos_cached.device != x.device or
56
+ self._cos_cached.dtype != x.dtype
57
+ ):
58
+ self._seq_len_cached = seq_len
59
+ t = torch.arange(
60
+ x.shape[seq_dimension], device=x.device, dtype=torch.float32
61
+ )
62
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
63
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
64
+
65
+ self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
66
+ self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
67
+
68
+ return self._cos_cached, self._sin_cached
69
+
70
+ def forward(self, q, k):
71
+ self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
72
+ q.float(), seq_dimension=-2
73
+ )
74
+ if k is not None:
75
+ return (
76
+ apply_rotary_pos_emb(
77
+ q.float(), self._cos_cached, self._sin_cached
78
+ ).type_as(q),
79
+ apply_rotary_pos_emb(
80
+ k.float(), self._cos_cached, self._sin_cached
81
+ ).type_as(k),
82
+ )
83
+ else:
84
+ return (
85
+ apply_rotary_pos_emb(
86
+ q.float(), self._cos_cached, self._sin_cached
87
+ ).type_as(q), None
88
+ )
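A usage sketch for the rotary embedding (dummy shapes are illustrative; the import path follows this upload's file layout). Queries and keys are expected as [batch, heads, length, head_dim]:

import torch
from models.dit.rotary import RotaryEmbedding

rope = RotaryEmbedding(dim=64)       # dim = per-head dimension
q = torch.randn(2, 8, 100, 64)
k = torch.randn(2, 8, 100, 64)
q_rot, k_rot = rope(q, k)            # same shapes, rotated by position
print(q_rot.shape, k_rot.shape)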
models/dit/span_mask.py ADDED
@@ -0,0 +1,149 @@
1
+ import numpy as np
2
+ import torch
3
+ from typing import Optional, Tuple
4
+
5
+
6
+ def compute_mask_indices(
7
+ shape: Tuple[int, int],
8
+ padding_mask: Optional[torch.Tensor],
9
+ mask_prob: float,
10
+ mask_length: int,
11
+ mask_type: str = "static",
12
+ mask_other: float = 0.0,
13
+ min_masks: int = 0,
14
+ no_overlap: bool = False,
15
+ min_space: int = 0,
16
+ ) -> np.ndarray:
17
+ """
18
+ Computes random mask spans for a given shape
19
+
20
+ Args:
21
+ shape: the shape for which to compute masks.
22
+ should be of size 2 where first element is batch size and 2nd is timesteps
23
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
24
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
25
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
26
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
27
+ mask_type: how to compute mask lengths
28
+ static = fixed size
29
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
30
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
31
+ poisson = sample from a Poisson distribution with lambda = mask_length
32
+ min_masks: minimum number of masked spans
33
+ no_overlap: if true, will use an alternative recursive algorithm that prevents spans from overlapping
34
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
35
+ """
36
+
37
+ bsz, all_sz = shape
38
+ mask = np.full((bsz, all_sz), False)
39
+
40
+ # Convert mask_prob to a NumPy array
41
+ mask_prob = np.array(mask_prob)
42
+
43
+ # Calculate all_num_mask for each element in the batch
44
+ all_num_mask = np.floor(
45
+ mask_prob * all_sz / float(mask_length) + np.random.rand(bsz)
46
+ ).astype(int)
47
+
48
+ # Apply the max operation with min_masks for each element
49
+ all_num_mask = np.maximum(min_masks, all_num_mask)
50
+
51
+ mask_idcs = []
52
+ for i in range(bsz):
53
+ if padding_mask is not None:
54
+ sz = all_sz - padding_mask[i].long().sum().item()
55
+ num_mask = int(
56
+ # add a random number for probabilistic rounding
57
+ mask_prob * sz / float(mask_length) + np.random.rand()
58
+ )
59
+ num_mask = max(min_masks, num_mask)
60
+ else:
61
+ sz = all_sz
62
+ num_mask = all_num_mask[i]
63
+
64
+ if mask_type == "static":
65
+ lengths = np.full(num_mask, mask_length)
66
+ elif mask_type == "uniform":
67
+ lengths = np.random.randint(
68
+ mask_other, mask_length*2 + 1, size=num_mask
69
+ )
70
+ elif mask_type == "normal":
71
+ lengths = np.random.normal(mask_length, mask_other, size=num_mask)
72
+ lengths = [max(1, int(round(x))) for x in lengths]
73
+ elif mask_type == "poisson":
74
+ lengths = np.random.poisson(mask_length, size=num_mask)
75
+ lengths = [int(round(x)) for x in lengths]
76
+ else:
77
+ raise Exception("unknown mask selection " + mask_type)
78
+
79
+ if sum(lengths) == 0:
80
+ lengths[0] = min(mask_length, sz - 1)
81
+
82
+ if no_overlap:
83
+ mask_idc = []
84
+
85
+ def arrange(s, e, length, keep_length):
86
+ span_start = np.random.randint(s, e - length)
87
+ mask_idc.extend(span_start + i for i in range(length))
88
+
89
+ new_parts = []
90
+ if span_start - s - min_space >= keep_length:
91
+ new_parts.append((s, span_start - min_space + 1))
92
+ if e - span_start - keep_length - min_space > keep_length:
93
+ new_parts.append((span_start + length + min_space, e))
94
+ return new_parts
95
+
96
+ parts = [(0, sz)]
97
+ min_length = min(lengths)
98
+ for length in sorted(lengths, reverse=True):
99
+ lens = np.fromiter(
100
+ (
101
+ e - s if e - s >= length + min_space else 0
102
+ for s, e in parts
103
+ ),
104
+ int,  # np.int was removed in NumPy >= 1.24; the builtin int is equivalent here
105
+ )
106
+ l_sum = np.sum(lens)
107
+ if l_sum == 0:
108
+ break
109
+ probs = lens / np.sum(lens)
110
+ c = np.random.choice(len(parts), p=probs)
111
+ s, e = parts.pop(c)
112
+ parts.extend(arrange(s, e, length, min_length))
113
+ mask_idc = np.asarray(mask_idc)
114
+ else:
115
+ min_len = min(lengths)
116
+ if sz - min_len <= num_mask:
117
+ min_len = sz - num_mask - 1
118
+
119
+ mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
120
+
121
+ mask_idc = np.asarray([
122
+ mask_idc[j] + offset for j in range(len(mask_idc))
123
+ for offset in range(lengths[j])
124
+ ])
125
+
126
+ mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
127
+ # min_len = min([len(m) for m in mask_idcs])
128
+ for i, mask_idc in enumerate(mask_idcs):
129
+ # if len(mask_idc) > min_len:
130
+ # mask_idc = np.random.choice(mask_idc, min_len, replace=False)
131
+ mask[i, mask_idc] = True
132
+
133
+ return torch.tensor(mask)
134
+
135
+
136
+ if __name__ == '__main__':
137
+ mask = compute_mask_indices(
138
+ shape=[4, 500],
139
+ padding_mask=None,
140
+ mask_prob=[0.65, 0.5, 0.65, 0.65],
141
+ mask_length=10,
142
+ mask_type="static",
143
+ mask_other=0.0,
144
+ min_masks=1,
145
+ no_overlap=False,
146
+ min_space=0,
147
+ )
148
+ print(mask)
149
+ print(mask.sum(dim=1))