anhth committed on
Commit
8314c30
·
1 Parent(s): cc2b90a

Initial Commit

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ *.pyc
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torchvision.transforms import v2
3
+ import gradio as gr
4
+ from PIL import Image
5
+ from colorizer import ColorComicNet, MODEL_CFG
6
+ from utils import smart_padding, remove_padding
7
+
8
# Transformation pipeline applied to every input image:
# PIL -> tensor, uint8 -> float32 in [0, 1], then Normalize(mean=0.5, std=0.5)
# maps values to [-1, 1] (postprocess_output undoes this with (x + 1) / 2).
TRANSFORM = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.5], std=[0.5])
])
14
+
15
# Image preprocessing and postprocessing functions
def preprocess_image(image: Image.Image, divisor=16):
    """Turn a PIL image into a padded, normalized model input.

    Returns a (1, 3, H', W') tensor whose spatial dims are multiples of
    `divisor`, plus the padding record needed to undo the padding later.
    """
    rgb = image.convert('RGB')
    batched = TRANSFORM(rgb).unsqueeze(0)  # (1, 3, H, W)
    padded, padding = smart_padding(batched, divisor=divisor)
    return padded, padding
22
+
23
def postprocess_output(output_tensor, padding):
    """Convert a model output tensor into an (H, W, C) numpy image in [0, 1].

    Args:
        output_tensor: (1, C, H', W') tensor in [-1, 1], possibly padded.
        padding: padding record from `smart_padding`, removed here.

    Returns:
        numpy.ndarray of shape (H, W, C) with values clamped to [0, 1].
    """
    output_tensor = remove_padding(output_tensor, padding)
    output_tensor = (output_tensor + 1) / 2  # Scale back from [-1, 1] to [0, 1]
    # detach()/cpu() so conversion also works for grad-enabled or GPU tensors,
    # not only inside torch.no_grad() on CPU.
    output_image = (
        output_tensor.detach().cpu().clamp(0, 1).squeeze(0).permute(1, 2, 0).numpy()
    )
    return output_image
29
+
30
# Define the colorization function
def colorize_image(gray_image: Image.Image):
    """Colorize a single grayscale image using the model.

    Args:
        gray_image: input PIL image, or None (Gradio passes None when the
            button is clicked without an uploaded image).

    Returns:
        (H, W, C) numpy array in [0, 1], or None if no image was provided.
    """
    # Guard: without this, preprocess_image crashes on None.convert('RGB')
    # when the user clicks "Colorize" before uploading anything.
    if gray_image is None:
        return None
    with torch.no_grad():
        # Preprocess (divisor=64 matches the model's total downsampling factor)
        input_tensor, padding = preprocess_image(gray_image, divisor=64)
        # Inference
        output = model(input_tensor)
        # Postprocess
        output_image = postprocess_output(output, padding)
    return output_image
41
+
42
# Initialize the model (CPU-only inference).
# NOTE: MODEL_CFG is a dict of constructor keyword arguments, so it must be
# unpacked with **. Passing it positionally binds the whole dict to
# `input_shape`, and ColorComicNet's stem then evaluates input_shape[0]
# (i.e. MODEL_CFG[0]) -> KeyError at startup.
model = ColorComicNet(**MODEL_CFG)
model.load_state_dict(torch.load("./weights/colorizer.pth", map_location=torch.device('cpu')))
model.fuse()  # collapse re-parameterizable branches into single convs for inference
model.eval()
47
+
48
# Create the Gradio interface

# Custom CSS injected into gr.Blocks below: dark gradient page background,
# a centered 1000px container, centered header text, and an orange-gradient
# primary button (class names referenced via elem_classes in the UI).
custom_css = """
body {
    background: linear-gradient(135deg, #1e1e2f, #2a2a40);
    color: white;
}
.gradio-container {
    max-width: 1000px !important;
    margin: auto;
}
.header {
    text-align: center;
    padding: 20px;
}
.header h1 {
    font-size: 2.2rem;
    margin-bottom: 5px;
}
.header p {
    color: #cfcfe0;
}
.button-primary {
    background: linear-gradient(90deg, #ff7a18, #ffb347);
    border: none;
    color: white;
    font-weight: bold;
}
"""
77
+
78
# Two-column Gradio app: grayscale PIL image in (left), colorized numpy
# array out (right); the button wires the two through colorize_image.
with gr.Blocks(css=custom_css) as demo:
    # Header
    with gr.Column(elem_classes="header"):
        gr.Markdown("# 🎨 Comic Colorization")
        gr.Markdown("Bring your grayscale comics to life with **ColorComicNet**")
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            # type="pil" so colorize_image receives a PIL.Image (or None)
            input_image = gr.Image(
                label="📥 Upload Grayscale Image",
                type="pil",
            )
            colorize_button = gr.Button(
                "✨ Colorize Image",
                elem_classes="button-primary"
            )

        with gr.Column(scale=1):
            # type="numpy" matches the (H, W, C) array returned by postprocess_output
            output_image = gr.Image(
                label="📤 Colorized Result",
                type="numpy",
            )

    # Example section
    # gr.Markdown("### 🖼️ Try an example")
    # examples = gr.Examples(
    #     examples=[
    #         ["example1.png"],
    #         ["example2.png"]
    #     ],
    #     inputs=input_image
    # )

    # Footer
    gr.Markdown("---")

    # Interaction
    colorize_button.click(
        fn=colorize_image,
        inputs=input_image,
        outputs=output_image
    )

demo.launch()
colorizer.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numbers
5
+ from dataclasses import dataclass, asdict
6
+ from einops import rearrange
7
+
8
class UpSample(nn.Module):
    """2x spatial upsampler.

    A 1x1 conv doubles the channels, then PixelShuffle trades 4x channels
    for 2x height/width, so the net effect is
    (B, C, H, W) -> (B, C/2, 2H, 2W).
    """
    def __init__(self, filters=64):
        super().__init__()
        self.conv = nn.Conv2d(filters, filters * 2, kernel_size=1, stride=1, padding=0, bias=True)
        self.pixel_shuffle = nn.PixelShuffle(upscale_factor=2)

    def forward(self, x):
        return self.pixel_shuffle(self.conv(x))
19
+
20
## DownSampling block
class DownSample(nn.Module):
    """2x spatial downsampler.

    A 1x1 conv halves the channels, then PixelUnshuffle folds each 2x2
    spatial patch into channels (x4), so the net effect is
    (B, C, H, W) -> (B, 2C, H/2, W/2).
    """
    def __init__(self, filters=64):
        super().__init__()
        self.conv = nn.Conv2d(filters, filters // 2, kernel_size=1, stride=1, padding=0, bias=True)
        self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=2)

    def forward(self, x):
        """(B, C, H, W) -> (B, 2C, H/2, W/2).

        The original docstring claimed (B, C/4, H/2, W/2); the conv halves
        channels and PixelUnshuffle quadruples them, giving 2C.
        """
        x = self.conv(x)
        x = self.pixel_unshuffle(x)
        return x
33
+
34
# Custom LayerNormalization
class BiasFree_LayerNorm(nn.Module):
    """Bias-free layer norm over the last dimension.

    No mean subtraction and no additive bias: the input is divided by
    sqrt(var + 1e-5) (biased variance) and scaled by a learnable weight.
    """
    def __init__(self, normalized_shape):
        super().__init__()
        shape = (
            (normalized_shape,)
            if isinstance(normalized_shape, numbers.Integral)
            else normalized_shape
        )
        shape = torch.Size(shape)
        assert len(shape) == 1

        self.weight = nn.Parameter(torch.ones(shape))
        self.normalized_shape = shape

    def forward(self, x):
        x = x.contiguous()
        # biased variance over the last dim; no mean subtraction (bias-free)
        var = x.var(-1, keepdim=True, unbiased=False)
        return x / torch.sqrt(var + 1e-5) * self.weight
52
+
53
class WithBias_LayerNorm(nn.Module):
    """Layer norm over the last dimension with learnable scale and shift:
    (x - mean) / sqrt(var + 1e-5) * weight + bias."""
    def __init__(self, normalized_shape):
        super().__init__()
        shape = (
            (normalized_shape,)
            if isinstance(normalized_shape, numbers.Integral)
            else normalized_shape
        )
        shape = torch.Size(shape)
        assert len(shape) == 1

        self.weight = nn.Parameter(torch.ones(shape))
        self.bias = nn.Parameter(torch.zeros(shape))
        self.normalized_shape = shape

    def forward(self, x):
        x = x.contiguous()
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)  # biased variance
        return (x - mean) / torch.sqrt(var + 1e-5) * self.weight + self.bias
72
+
73
class LayerNorm(nn.Module):
    """Layer normalization wrapper supporting 'BiasFree' and 'WithBias' bodies.

    With out_4d=True, a (B, C, H, W) input is flattened to (B, H*W, C),
    normalized over the channel dimension, and restored to 4-D; with
    out_4d=False the input is normalized as-is over its last dimension.
    """
    def __init__(self, dim, LayerNorm_type, out_4d=True):
        super().__init__()
        norm_cls = BiasFree_LayerNorm if LayerNorm_type == 'BiasFree' else WithBias_LayerNorm
        self.body = norm_cls(dim)
        self.out_4d = out_4d

    def to_3d(self, x):
        # (B, C, H, W) -> (B, H*W, C); 3-D inputs pass through unchanged
        ndim = len(x.shape)
        if ndim == 3:
            return x
        if ndim == 4:
            return rearrange(x, 'b c h w -> b (h w) c')
        raise ValueError("Input must be a 3D or 4D tensor")

    def to_4d(self, x, h, w):
        # (B, H*W, C) -> (B, C, H, W); 4-D inputs pass through unchanged
        ndim = len(x.shape)
        if ndim == 4:
            return x
        if ndim == 3:
            return rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
        raise ValueError("Input must be a 3D or 4D tensor")

    def forward(self, x):
        if not self.out_4d:
            return self.body(x)
        h, w = x.shape[-2:]
        return self.to_4d(self.body(self.to_3d(x)), h, w)
107
+
108
class RepConv3(nn.Module):
    """Re-parameterizable 3x3 convolution (RepVGG / DBB style).

    In training mode the output is the sum of five parallel branches
    (3x3, 1x1, 1x3, 3x1 convs plus a bias-free 1x1 -> 3x3 sequence);
    `fuse()` algebraically merges them into the single `reparam` 3x3 conv
    used in deploy mode, so inference runs one conv instead of six.
    """
    def __init__(self, in_channels, out_channels, groups, deploy=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.groups = groups
        self.deploy = deploy
        # Always built so state dicts saved after fuse() load cleanly.
        self.reparam = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, groups=groups)
        if not deploy:
            self.conv_3x3 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, groups=groups)
            self.conv_1x1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, groups=groups)
            self.conv_1x3 = nn.Conv2d(in_channels, out_channels, kernel_size=(1, 3), padding=(0, 1), groups=groups)
            self.conv_3x1 = nn.Conv2d(in_channels, out_channels, kernel_size=(3, 1), padding=(1, 0), groups=groups)
            # Sequential branch: bias-free so its fusion contributes no bias term.
            self.conv_1x1_branch = nn.Conv2d(in_channels, in_channels, kernel_size=1, groups=groups, bias=False)
            self.conv_3x3_branch = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, groups=groups, bias=False)
        else:
            self._delete_branches()

    def _delete_branches(self):
        # Drop training-time branches (no-op for any that do not exist).
        for name in ['conv_3x3','conv_1x1','conv_1x3','conv_3x1', 'conv_1x1_branch', 'conv_3x3_branch']:
            if hasattr(self, name):
                delattr(self, name)

    @torch.no_grad()  # consistency with the other fuse() methods; weight merging must not build an autograd graph
    def fuse(self, delete_branches=True):
        """Merge all branches into `self.reparam` and switch to deploy mode."""
        if self.deploy:
            return
        # Extract weights and biases
        conv_3x3_w, conv_3x3_b = self.conv_3x3.weight, self.conv_3x3.bias
        conv_1x1_w, conv_1x1_b = self.conv_1x1.weight, self.conv_1x1.bias
        conv_1x3_w, conv_1x3_b = self.conv_1x3.weight, self.conv_1x3.bias
        conv_3x1_w, conv_3x1_b = self.conv_3x1.weight, self.conv_3x1.bias
        conv_1x1_branch_w, conv_3x3_branch_w = self.conv_1x1_branch.weight, self.conv_3x3_branch.weight
        # Pad the smaller kernels to 3x3 (F.pad order: left, right, top, bottom)
        conv_1x1_w_pad = F.pad(conv_1x1_w, [1, 1, 1, 1])
        conv_1x3_w_pad = F.pad(conv_1x3_w, [0, 0, 1, 1])
        conv_3x1_w_pad = F.pad(conv_3x1_w, [1, 1, 0, 0])
        # Collapse the 1x1 -> 3x3 sequence: convolving the 3x3 weight with the
        # transposed 1x1 weight yields the equivalent single 3x3 kernel.
        if self.groups == 1:
            conv_1x1_3x3_w_pad = F.conv2d(conv_3x3_branch_w, conv_1x1_branch_w.permute(1, 0, 2, 3))
        else:
            # Grouped case: do the same merge per group and concatenate the
            # resulting output-channel slices.
            w_slices = []
            conv_1x1_branch_w_T = conv_1x1_branch_w.permute(1, 0, 2, 3)
            in_channels_per_group = self.in_channels // self.groups
            out_channels_per_group = self.out_channels // self.groups
            for g in range(self.groups):
                # Slice the transposed 1x1 weights for this group's channels
                conv_1x1_branch_w_T_slice = conv_1x1_branch_w_T[:, g*in_channels_per_group:(g+1)*in_channels_per_group, :, :]
                # Slice the 3x3 weights for this group's output channels
                conv_3x3_branch_w_slice = conv_3x3_branch_w[g*out_channels_per_group:(g+1)*out_channels_per_group, :, :, :]
                w_slices.append(F.conv2d(conv_3x3_branch_w_slice, conv_1x1_branch_w_T_slice))
            conv_1x1_3x3_w_pad = torch.cat(w_slices, dim=0)
        # Fuse weights and biases (the sequential branch is bias-free).
        conv_w = conv_3x3_w + conv_1x1_w_pad + conv_1x3_w_pad + conv_3x1_w_pad + conv_1x1_3x3_w_pad
        if conv_3x3_b is None:
            conv_3x3_b = torch.zeros(self.out_channels, device=conv_w.device)
        conv_b = conv_3x3_b + conv_1x1_b + conv_1x3_b + conv_3x1_b
        self.reparam.weight.data.copy_(conv_w)
        self.reparam.bias.data.copy_(conv_b)
        # Delete the original branches
        if delete_branches:
            self._delete_branches()
        # Set deploy flag
        self.deploy = True

    def forward(self, x):
        if self.deploy:
            return self.reparam(x)
        else:
            return self.conv_3x3(x) + self.conv_1x1(x) + self.conv_1x3(x) + self.conv_3x1(x) + self.conv_3x3_branch(self.conv_1x1_branch(x))
176
+
177
+ from monarch_attn import MonarchAttention
178
+
179
@dataclass
class RepAttnConfig:
    """Constructor kwargs for RepAttn (unpacked via asdict() in RepTransformerBlock)."""
    dim: int                 # channel count of the attention block
    num_heads: int = 8       # heads used to split channels in RepAttn.forward
    block_size: int = 16     # MonarchAttention block size
    num_steps: int = 2       # MonarchAttention iteration count
    pad_type: str = "pre"    # MonarchAttention padding side
    impl: str = "torch"      # MonarchAttention backend (presumably "torch"/"triton" — confirm)
    deploy: bool = False     # True -> RepAttn uses MonarchAttention from the start
188
+
189
class RepAttn(nn.Module):
    """ Re-parameterizable Attention Block using MonarchAttention as the core attention mechanism.

    Trains with exact scaled dot-product attention (`common_attn`); `fuse()`
    swaps the attention callable to the MonarchAttention module for deployment.
    Note: q/k/v are reshaped to (b, head, c, h*w), so attention is computed
    across the channel dimension with the spatial axis as the inner product
    dimension (cf. the `scale` below using q.shape[-1] == h*w).
    """
    def __init__(self, dim, num_heads=8, block_size=14, num_steps=1, pad_type="pre", impl="torch", deploy=False):
        # NOTE(review): defaults here (block_size=14, num_steps=1) differ from
        # RepAttnConfig's defaults; the config values win when used together.
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Conv2d(dim, dim * 3, kernel_size=1)  # joint q/k/v projection
        self.monarch_attn = MonarchAttention(
            block_size=block_size,
            num_steps=num_steps,
            pad_type=pad_type,
            impl=impl
        )
        # Select the attention implementation once; forward just calls attn_fn.
        if deploy:
            self.attn_fn = self.monarch_attn
        else:
            self.attn_fn = self.common_attn
        self.proj = nn.Conv2d(dim, dim, kernel_size=1)  # output projection
        self.deploy = deploy

    def common_attn(self, q, k, v):
        """ Scaled Dot-Product Attention """
        scale = (q.shape[-1]) ** -0.5  # scales by 1/sqrt(h*w) given the layout above
        attn = (q @ k.transpose(-2, -1)) * scale
        attn = attn.softmax(dim=-1)
        out = attn @ v
        return out

    @torch.no_grad()
    def fuse(self):
        # Deployment switch: replace exact attention with MonarchAttention.
        if not self.deploy:
            self.attn_fn = self.monarch_attn
            self.deploy = True

    def forward(self, x):
        # x: (B, C, H, W) -> out: (B, C, H, W)
        B, C, H, W = x.shape
        qkv = self.qkv(x)
        q, k, v = torch.chunk(qkv, 3, dim=1)
        # Split channels into heads; flatten spatial dims into the last axis.
        q = rearrange(q, 'b (head c) h w -> b head c (h w)', head=self.num_heads)
        k = rearrange(k, 'b (head c) h w -> b head c (h w)', head=self.num_heads)
        v = rearrange(v, 'b (head c) h w -> b head c (h w)', head=self.num_heads)
        attn_out = self.attn_fn(q, k, v)
        attn_out = rearrange(attn_out, 'b head c (h w) -> b (head c) h w', head=self.num_heads, h=H, w=W)
        out = self.proj(attn_out)
        return out
233
+
234
@dataclass
class FFNConfig:
    """Constructor kwargs for RepFFN (unpacked via asdict() in RepTransformerBlock)."""
    dim: int                    # input/output channel count
    expansion_factor: int = 1   # hidden width = int(dim * expansion_factor)
    deploy: bool = False        # build the RepConv3 layers already fused
239
+
240
class RepFFN(nn.Module):
    """Gated feed-forward block built from re-parameterizable convs.

    `project_in` lifts the input to the hidden width, a depthwise RepConv3
    produces two stacked halves that form a GELU gate, and a 1x1 conv
    projects back down to `dim` channels.
    """
    def __init__(self, dim, expansion_factor=1, deploy=False):
        super().__init__()
        hidden_features = int(dim * expansion_factor)
        self.project_in = RepConv3(dim, hidden_features, groups=1, deploy=deploy)
        self.dwconv = RepConv3(hidden_features, hidden_features*2, groups=hidden_features, deploy=deploy)
        self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1)

    @torch.no_grad()
    def fuse(self):
        """Re-parameterize both RepConv3 layers for deployment."""
        self.project_in.fuse()
        self.dwconv.fuse()

    def forward(self, x):
        gate, value = self.dwconv(self.project_in(x)).chunk(2, dim=1)
        return self.project_out(F.gelu(gate) * value)
260
+
261
class SkipConnection(nn.Module):
    """Merge a decoder feature map with its encoder counterpart:
    channel-concatenation followed by a 1x1 conv back to `dim` channels."""
    def __init__(self, dim):
        super().__init__()
        self.conv = nn.Conv2d(dim*2, dim, kernel_size=1)

    def forward(self, x1, x2):
        return self.conv(torch.cat([x1, x2], dim=1))
270
+
271
class RepTransformerBlock(nn.Module):
    """Pre-norm transformer block: x + attn(norm1(x)), then x + ffn(norm2(x)).

    Both sub-modules are re-parameterizable; fuse() collapses them for
    deployment without changing the block's interface.
    """
    def __init__(self, rep_attn_cfg: RepAttnConfig, ffn_cfg: FFNConfig, norm_type='WithBias'):
        super().__init__()
        self.rep_attn = RepAttn(**asdict(rep_attn_cfg))
        self.rep_ffn = RepFFN(**asdict(ffn_cfg))
        self.norm1 = LayerNorm(rep_attn_cfg.dim, norm_type)
        self.norm2 = LayerNorm(rep_attn_cfg.dim, norm_type)

    @torch.no_grad()
    def fuse(self):
        # Re-parameterize both sub-modules for inference.
        self.rep_attn.fuse()
        self.rep_ffn.fuse()

    def forward(self, x):
        # Pre-norm residual layout.
        x = x + self.rep_attn(self.norm1(x))
        x = x + self.rep_ffn(self.norm2(x))
        return x
288
+
289
class Block(nn.Module):
    """A stage of `num_block` RepTransformerBlocks applied sequentially."""
    def __init__(self, num_block, rep_attn_cfg: RepAttnConfig, ffn_cfg: FFNConfig, norm_type='WithBias'):
        super().__init__()
        self.num_block = num_block
        self.blocks = nn.ModuleList(
            RepTransformerBlock(rep_attn_cfg, ffn_cfg, norm_type)
            for _ in range(num_block)
        )

    @torch.no_grad()
    def fuse(self):
        """Re-parameterize every contained transformer block."""
        for blk in self.blocks:
            blk.fuse()

    def forward(self, x):
        for blk in self.blocks:
            x = blk(x)
        return x
306
+
307
class ColorComicNet(nn.Module):
    """ Main model implementation.

    U-Net-shaped colorizer: a stride-4 stem, an encoder of transformer
    stages separated by DownSample, a bottleneck stage, a mirrored decoder
    with UpSample + SkipConnection, then a head whose output is added to
    the input as a global residual and passed through `last_act`.

    NOTE(review): dims/num_blocks/num_heads use mutable list defaults —
    harmless here because they are never mutated, but worth tightening.
    """
    def __init__(self, input_shape=(3, 1024, 1024), output_channels=3, deploy=False, dims=[48, 96, 192, 384], num_blocks=[4, 6, 6, 8], num_heads=[1, 2, 2, 4], bias=True, last_act=None):
        super().__init__()
        assert len(dims) == len(num_blocks) == len(num_heads), "Length of dims, num_blocks and num_heads must be the same"
        self.input_shape = input_shape
        self.output_channels = output_channels
        self.deploy = deploy
        self.dims = dims
        self.num_blocks = num_blocks
        self.bias = bias
        self.num_heads = num_heads

        # Extractor: stride-4 stem; forward() upsamples by 4 to compensate.
        self.stem = nn.Conv2d(input_shape[0], dims[0], kernel_size=7, stride=4, padding=3, bias=bias)

        # Encoder: one Block per scale; DownSample between scales.
        layers = []
        down_convs = []
        for idx in range(len(dims)):
            attn_cfg, ffn_cfg = self.build_cfg(dims[idx], num_heads[idx])
            block = Block(num_blocks[idx], attn_cfg, ffn_cfg, norm_type='WithBias')
            if idx < len(dims) - 1:
                down_convs.append(DownSample(dims[idx]))
            layers.append(block)
        self.bottleneck = layers[-1] # Last encoder layer as bottleneck
        self.encoder = nn.ModuleList(layers[:-1])
        self.downsample = nn.ModuleList(down_convs)

        # Decoder: mirrors the encoder from deep to shallow scales.
        layers = []
        up_convs = []
        skip_connections = []
        for idx in range(len(dims)-2, -1, -1):
            attn_cfg, ffn_cfg = self.build_cfg(dims[idx], num_heads[idx])
            # print(f"Decoder layer {idx}: shape {l_shape}")
            up_conv = UpSample(dims[idx+1])  # halves channels: dims[idx+1] -> dims[idx]
            block = Block(num_blocks[idx], attn_cfg, ffn_cfg, norm_type='WithBias')
            layers.append(block)
            up_convs.append(up_conv)
            skip_connections.append(SkipConnection(dims[idx]))
        self.decoder = nn.ModuleList(layers)
        self.up_sample = nn.ModuleList(up_convs)
        self.skip = nn.ModuleList(skip_connections)

        # Head: re-parameterizable conv + GELU + 1x1 projection to output channels.
        self.head = nn.Sequential(
            RepConv3(dims[0], dims[0]//2, 1, deploy=deploy),
            nn.GELU(),
            nn.Conv2d(dims[0]//2, output_channels, kernel_size=1, bias=bias),
        )
        self.last_act = last_act if last_act is not None else nn.Identity()

    @torch.no_grad()
    def fuse(self):
        """Re-parameterize every fusable sub-module for deployment."""
        for block in self.encoder:
            block.fuse()
        self.bottleneck.fuse()
        for block in self.decoder:
            block.fuse()
        for conv in self.head:
            if isinstance(conv, RepConv3):
                conv.fuse()

    def build_cfg(self, dim, head):
        """Build the (attention, FFN) config pair for one scale."""
        # RepAttn config
        attn_cfg = RepAttnConfig(
            dim=dim,
            num_heads=head,
            block_size=12,
            num_steps=2,
            pad_type="pre",
            impl="torch",
            deploy=self.deploy
        )
        ## FFN config
        ffn_cfg = FFNConfig(
            dim=dim,
            expansion_factor=1,
        )
        return attn_cfg, ffn_cfg

    def forward(self, x):
        """
        x: (B, C, H, W)
        """
        res = x  # global residual added back after the head
        x = self.stem(x)
        feats = []
        # Encoder path: collect pre-downsample features for the skips.
        for blk, down in zip(self.encoder, self.downsample):
            x = blk(x)
            feats.append(x)
            x = down(x)
        x = self.bottleneck(x)
        # Decoder path: upsample, merge the matching encoder feature, refine.
        for blk, up, skip in zip(self.decoder, self.up_sample, self.skip):
            x = up(x)
            cur_feat = feats.pop()  # deepest remaining encoder feature
            x = skip(x, cur_feat)
            x = blk(x)
        # Undo the stem's stride-4 downsampling.
        x = F.interpolate(x, scale_factor=4, mode='bilinear')
        x = self.head(x) + res
        x = self.last_act(x)
        return x
410
+
411
# Example model configuration (constructor kwargs for ColorComicNet;
# output_channels is left at its default of 3).
MODEL_CFG = {
    'input_shape': (3, 512, 512),   # (C, H, W); only C feeds the stem conv
    'dims': [24, 48, 96, 192],      # channel widths per scale
    'num_blocks': [1, 2, 2, 4],     # transformer blocks per scale
    'num_heads': [1, 2, 4, 8],      # attention heads per scale
    'bias': True,                   # bias in stem/head convs
    'last_act': nn.Tanh(),          # final activation -> outputs in [-1, 1]
    'deploy': False                 # build with training branches; fuse() later
}
monarch_attn/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .monarch_attention import MonarchAttention
monarch_attn/ma_history.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from math import sqrt
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+
7
+ Tensor = torch.Tensor
8
+
9
+
10
def monarch_matrix(L: Tensor, R: Tensor) -> Tensor:
    """Materialize the dense matrix represented by Monarch factors.

    L has dims (j, k, l) and R has dims (k, j, i); the elementwise product
    out[l, j, k, i] = L[j, k, l] * R[k, j, i] is flattened to a
    (l*j) x (k*i) matrix.
    """
    out = torch.einsum("jkl,kji->ljki", L, R)
    d_l, d_j, d_k, d_i = out.shape
    return out.reshape(d_l * d_j, d_k * d_i)
13
+
14
+
15
def monarch_attention_history(q: Tensor, k: Tensor, T: int, B: int) -> list[Tensor]:
    """Run T alternating-maximization steps for the Monarch factors (L, R)
    of the attention matrix and return the dense monarch_matrix(L, R)
    snapshot recorded at each step.

    q, k: (N, D) with N divisible by B (block size); M = N // B blocks.
    """
    N, D = q.shape
    M = N // B

    q = q / sqrt(D)  # standard attention scaling folded into q

    # Block q by position-within-block (j) and k by block index (k).
    qb = rearrange(q, "(l j) v -> j l v", j=B)
    kb = rearrange(k, "(k i) v -> k i v", i=B)

    # Initialize L as B stacked identity matrices.
    L = torch.stack(B * [torch.eye(M, device=q.device)])

    history = []

    # Alternating maximization for L, R
    for t in range(T):
        # R update: softmax over key positions, normalized by L's column sums.
        aR = torch.einsum("jkl,jlv->kjv", L, qb)
        bR = torch.einsum("kjv,kiv->kji", aR, kb)
        cR = torch.einsum("jkl->kj", L)
        R = F.softmax(bR / cR[:, :, None], dim=2)

        history.append(monarch_matrix(L, R))

        # L update: softmax over blocks with an entropy correction from R.
        aL = torch.einsum("kji,kiv->jkv", R, kb)
        bL = torch.einsum("jkv,jlv->jkl", aL, qb)
        cL = torch.einsum("kji->jk", R * torch.log(R))  # sum of R*log(R) over i
        L = F.softmax(bL - cL[:, :, None], dim=1)

    return history
monarch_attn/ma_torch.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from math import sqrt
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ Tensor = torch.Tensor
7
+ xlogy = torch.special.xlogy
8
+
9
+
10
def al_cl_ref(ar, k, cr, sm_scale, mask, eps=1e-12):
    """One right-factor (R) update of Monarch attention (reference impl).

    Builds a row-normalized factor r from the query-side accumulator `ar`,
    the per-row normalizer `cr` and keys `k`, then returns:
      al: sm_scale * (r @ k) with the two block dims swapped (next L-update input)
      cl: per-row entropy term sum(r * log r), consumed by the next softmax
    `mask` marks valid (non-padded) key positions; invalid logits get -inf.
    """
    r_hat = sm_scale * (ar @ k.transpose(-1, -2)).to(torch.float)
    r_hat = r_hat / (cr[..., :, None] + eps)
    r_hat = r_hat + torch.where(mask[..., None, :], 0.0, -float("inf"))
    # Numerically-stable exp-normalize: subtract the (clamped) row max,
    # and guard the denominator with eps against all-masked rows.
    r_hat = torch.exp(
        r_hat - torch.clamp(torch.max(r_hat, dim=-1, keepdim=True).values, min=eps)
    )
    r = r_hat / (torch.sum(r_hat, dim=-1, keepdim=True) + eps)
    r = torch.clamp(r, min=torch.finfo(r.dtype).tiny)  # keep xlogy finite below

    cl = torch.sum(xlogy(r, r), dim=-1).transpose(-1, -2)
    al = sm_scale * (r.to(k.dtype) @ k).transpose(-2, -3)

    return al, cl
24
+
25
+
26
def ar_cr_ref(al, q, cl, mask_t):
    """One left-factor (L) update of Monarch attention (reference impl).

    Builds the column-softmax factor l from `al` and queries `q`, corrected
    by the entropy term `cl` from the preceding R step, zeroing padded query
    positions via `mask_t`. Returns:
      ar: (l @ q) with block dims swapped (input to the next R update)
      cr: row sums of l (normalizer for the next R update)
    """
    l_hat = (al @ q.transpose(-1, -2)).to(torch.float)
    l_hat = l_hat - cl[..., :, None]  # entropy correction from the R step
    l = F.softmax(l_hat, dim=-2)      # softmax over the block axis
    l = mask_t[..., None, :] * l      # zero out padded query positions

    cr = torch.sum(l, dim=-1).transpose(-1, -2)
    ar = (l.to(q.dtype) @ q).transpose(-2, -3)

    return ar, cr
36
+
37
+
38
def al_y_cl_ref(ar, k, v, cr, sm_scale, mask, eps=1e-12):
    """Final R update, fused with value aggregation.

    Same computation as al_cl_ref, but additionally returns
    y = r @ v — the value-side partial result consumed by z_ref.
    """
    r_hat = sm_scale * (ar @ k.transpose(-1, -2)).to(torch.float)
    r_hat = r_hat / (cr[..., :, None] + eps)
    r_hat = r_hat + torch.where(mask[..., None, :], 0.0, -float("inf"))
    # Stable exp-normalize with eps-guarded max and denominator (see al_cl_ref).
    r_hat = torch.exp(
        r_hat - torch.clamp(torch.max(r_hat, dim=-1, keepdim=True).values, min=eps)
    )
    r = r_hat / (torch.sum(r_hat, dim=-1, keepdim=True) + eps)
    r = torch.clamp(r, min=torch.finfo(r.dtype).tiny)  # keep xlogy finite

    cl = torch.sum(xlogy(r, r), dim=-1).transpose(-1, -2)
    al = sm_scale * (r.to(k.dtype) @ k).transpose(-2, -3)
    y = (r.to(v.dtype) @ v).transpose(-2, -3)

    return al, y, cl
53
+
54
+
55
def z_ref(al, q, cl, y):
    """Final Monarch-attention output.

    Forms the left attention factor from queries `q` and `al` (logits
    corrected by the entropy term `cl`, softmaxed over the last dim) and
    applies it to the value-side partial result `y`; the two block dims
    are swapped back before returning a contiguous tensor.
    """
    logits = (q @ al.transpose(-1, -2)).to(torch.float) - cl[..., None, :]
    weights = F.softmax(logits, dim=-1)
    return (weights.to(y.dtype) @ y).transpose(-2, -3).contiguous()
63
+
64
+
65
def monarch_attention_torch(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Tensor | None,
    T: int,
    B: int,
    pre_pad: bool,
) -> Tensor:
    """Monarch attention, pure-PyTorch reference implementation.

    q, k: (E, H, N, D); v: (E, H, N, Dv). The sequence is padded to a
    multiple of the block size B (on the left if `pre_pad`, else right),
    reshaped into M blocks of B tokens, run through T alternating
    factor updates, and the padding is sliced off the result.
    attn_mask, when given, marks valid tokens per (E, N) position.
    """
    E, H, N, D = q.shape
    _, _, _, Dv = v.shape
    M = (N + B - 1) // B   # number of blocks after padding
    N_padded = M * B

    sm_scale = 1 / sqrt(D)

    # Pad the sequence dim to N_padded on the chosen side.
    pad_t = (N_padded - N, 0) if pre_pad else (0, N_padded - N)
    pad_t_2d = (0, 0) + pad_t

    q = F.pad(q, pad_t_2d).view(E, H, M, B, D)
    k = F.pad(k, pad_t_2d).view(E, H, M, B, D)
    v = F.pad(v, pad_t_2d).view(E, H, M, B, Dv)

    # Initial accumulator/normalizer for the first R update.
    ar = q
    cr = torch.ones(E, H, M, B, device=q.device, dtype=torch.float)
    q = q.transpose(-2, -3)  # swap block dims for the L updates

    # Validity mask over the (M, B) token grid: padding positions are False.
    pad_offset = N_padded - N if pre_pad else 0
    range_n = torch.arange(M * B).view(M, B).to(q.device)
    mask = range_n >= pad_offset if pre_pad else range_n < N

    if attn_mask is not None:
        attn_mask = F.pad(attn_mask, pad_t).view(E, 1, M, B)
        mask = torch.logical_and(mask, attn_mask)

    # T-1 full alternating (R, L) updates...
    for _ in range(T - 1):
        al, cl = al_cl_ref(ar, k, cr, sm_scale, mask)
        ar, cr = ar_cr_ref(al, q, cl, mask.mT)

    # ...then a final R update fused with value aggregation, and the output.
    al, y, cl = al_y_cl_ref(ar, k, v, cr, sm_scale, mask)
    z = z_ref(al, q, cl, y)
    z = z.view(E, H, N_padded, Dv)

    # Strip the padding from whichever side it was added.
    return z[..., N_padded - N :, :] if pre_pad else z[..., :N, :]
monarch_attn/ma_triton.py ADDED
@@ -0,0 +1,788 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from math import sqrt
2
+
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
+
8
+ Tensor = torch.Tensor
9
+
10
+
11
@triton.jit
def xlogx(x):
    # x * log(x) with the 0 * log(0) := 0 convention (entropy terms).
    return tl.where(x == 0, 0.0, x * tl.log(x))
14
+
15
+
16
@triton.jit
def _al_cl_kernel(
    # ar: (E, H, M, B, D) query-side accumulator
    ar_ptr,
    stride_ar_e,
    stride_ar_h,
    stride_ar_m,
    stride_ar_b,
    stride_ar_d,
    # k: keys, blocked layout
    k_ptr,
    stride_k_e,
    stride_k_h,
    stride_k_m,
    stride_k_b,
    stride_k_d,
    # cr: (E, H, M, B) per-row normalizer
    cr_ptr,
    stride_cr_e,
    stride_cr_h,
    stride_cr_m,
    stride_cr_b,
    # al: output accumulator for the following L update
    al_ptr,
    stride_al_e,
    stride_al_h,
    stride_al_m,
    stride_al_b,
    stride_al_d,
    # cl: output per-row entropy term
    cl_ptr,
    stride_cl_e,
    stride_cl_h,
    stride_cl_m,
    stride_cl_b,
    # optional validity mask over tokens
    mask_ptr,
    stride_mask_e,
    stride_mask_m,
    stride_mask_b,
    H: int,
    M: int,
    B: int,
    D: int,
    N: int,  # unpadded sequence length
    is_first_call: int,  # 1 when ar is still the (padded-layout) q tensor
    sm_scale: float,
    HAS_ATTN_MASK: tl.constexpr,
    BLOCK_B: tl.constexpr,
    BLOCK_D: tl.constexpr,
    PRE_PAD: tl.constexpr,
    EPS: tl.constexpr,
):
    # One program per (e, h, m): a single B x B right-factor tile.
    idx_ehm = tl.program_id(0)
    idx_eh = idx_ehm // M
    idx_e = idx_eh // H
    idx_h = idx_eh % H
    idx_m = idx_ehm % M

    # With PRE_PAD, the first (M*B - N) token slots are padding.
    pad_offset = M * B - N if PRE_PAD else 0

    range_b = tl.arange(0, BLOCK_B)
    range_d = tl.arange(0, BLOCK_D)
    range_n = B * idx_m + range_b  # absolute token index of each lane

    mask_b = range_b < B
    # pad_mask_b additionally excludes padding token slots.
    pad_mask_b = mask_b & ((range_n >= pad_offset) if PRE_PAD else (range_n < N))
    k_mask_b = pad_mask_b
    mask_d = range_d < D

    if HAS_ATTN_MASK:
        # Fold the user-provided token mask into the key-validity mask.
        mask_block_ptr = (
            mask_ptr
            + stride_mask_e * idx_e
            + stride_mask_m * idx_m
            + stride_mask_b * (range_b - pad_offset)
        )
        valid_token_mask = tl.load(
            mask_block_ptr,
            mask=pad_mask_b,
            other=0,
        )
        k_mask_b = pad_mask_b & valid_token_mask

    # Load ar (on the first call ar aliases the unpadded q, hence the offset shift)
    ar_block_ptr = (
        ar_ptr
        + stride_ar_e * idx_e
        + stride_ar_h * idx_h
        + stride_ar_m * idx_m
        + (
            stride_ar_b * (range_b - (pad_offset if is_first_call else 0))[:, None]
            + stride_ar_d * range_d[None, :]
        )
    )
    ar = tl.load(
        ar_block_ptr,
        mask=(pad_mask_b if is_first_call else mask_b)[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load k
    k_block_ptr = (
        k_ptr
        + stride_k_e * idx_e
        + stride_k_h * idx_h
        + stride_k_m * idx_m
        + (stride_k_b * (range_b - pad_offset)[:, None] + stride_k_d * range_d[None, :])
    )
    k = tl.load(
        k_block_ptr,
        mask=k_mask_b[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load cr
    cr_block_ptr = (
        cr_ptr
        + stride_cr_e * idx_e
        + stride_cr_h * idx_h
        + stride_cr_m * idx_m
        + (stride_cr_b * range_b)
    )
    cr = tl.load(cr_block_ptr, mask=mask_b, other=1.0)

    # Attention matrix: stable exp-normalize, mirroring al_cl_ref in ma_torch.py
    r = sm_scale * tl.dot(ar, tl.trans(k))
    r = r / (cr[:, None] + EPS)
    r = r + tl.where(k_mask_b[None, :], 0.0, float("-inf"))
    r = tl.exp(r - tl.clamp(tl.max(r, axis=1, keep_dims=True), EPS, float("inf")))
    r = r / (tl.sum(r, axis=1, keep_dims=True) + EPS)

    # Store cl (per-row entropy term sum(r * log r))
    cl = tl.sum(xlogx(r), axis=1)
    cl_block_ptr = (
        cl_ptr
        + stride_cl_e * idx_e
        + stride_cl_h * idx_h
        + stride_cl_m * idx_m
        + (stride_cl_b * range_b)
    )
    tl.store(cl_block_ptr, cl, mask=mask_b)

    # Store al = sm_scale * (r @ k)
    al = (sm_scale * tl.dot(r.to(k.dtype), k)).to(ar.dtype)
    al_block_ptr = (
        al_ptr
        + stride_al_e * idx_e
        + stride_al_h * idx_h
        + stride_al_m * idx_m
        + (stride_al_b * range_b[:, None] + stride_al_d * range_d[None, :])
    )
    tl.store(
        al_block_ptr,
        al,
        mask=mask_b[:, None] & mask_d[None, :],
    )
167
+
168
+
169
@triton.jit
def _ar_cr_kernel(
    # al: accumulator produced by _al_cl_kernel
    al_ptr,
    stride_al_e,
    stride_al_h,
    stride_al_m,
    stride_al_b,
    stride_al_d,
    # q: queries (transposed block layout)
    q_ptr,
    stride_q_e,
    stride_q_h,
    stride_q_m,
    stride_q_b,
    stride_q_d,
    # cl: entropy term from the preceding R update
    cl_ptr,
    stride_cl_e,
    stride_cl_h,
    stride_cl_m,
    stride_cl_b,
    # ar: output accumulator for the next R update
    ar_ptr,
    stride_ar_e,
    stride_ar_h,
    stride_ar_m,
    stride_ar_b,
    stride_ar_d,
    # cr: output normalizer for the next R update
    cr_ptr,
    stride_cr_e,
    stride_cr_h,
    stride_cr_m,
    stride_cr_b,
    # optional validity mask over tokens
    mask_ptr,
    stride_mask_e,
    stride_mask_m,
    stride_mask_b,
    H: int,
    M: int,
    B: int,
    D: int,
    N: int,  # unpadded sequence length
    HAS_ATTN_MASK: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_D: tl.constexpr,
    PRE_PAD: tl.constexpr,
):
    # One program per (e, h, b): a single M x M left-factor tile.
    idx_ehb = tl.program_id(0)
    idx_eh = idx_ehb // B
    idx_e = idx_eh // H
    idx_h = idx_eh % H
    idx_b = idx_ehb % B

    pad_offset = M * B - N if PRE_PAD else 0

    range_m = tl.arange(0, BLOCK_M)
    range_d = tl.arange(0, BLOCK_D)
    # Absolute token index covered by this program: position b of every block.
    range_n = idx_b + B * range_m

    mask_m = range_m < M
    q_mask_m = mask_m & (range_n >= pad_offset if PRE_PAD else range_n < N)
    mask_d = range_d < D

    if HAS_ATTN_MASK:
        # Fold the user-provided token mask into the query-validity mask.
        mask_block_ptr = (
            mask_ptr
            + stride_mask_e * idx_e
            + stride_mask_b * (idx_b - pad_offset)
            + stride_mask_m * range_m
        )
        valid_token_mask = tl.load(
            mask_block_ptr,
            mask=q_mask_m,
            other=0,
        )
        q_mask_m = q_mask_m & valid_token_mask

    # Load al
    al_block_ptr = (
        al_ptr
        + stride_al_e * idx_e
        + stride_al_h * idx_h
        + stride_al_b * idx_b
        + (stride_al_m * range_m[:, None] + stride_al_d * range_d[None, :])
    )
    al = tl.load(
        al_block_ptr,
        mask=mask_m[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load q (offset shifted by pad_offset: q keeps its unpadded layout here)
    q_block_ptr = (
        q_ptr
        + stride_q_e * idx_e
        + stride_q_h * idx_h
        + stride_q_b * (idx_b - pad_offset)
        + (stride_q_m * range_m[:, None] + stride_q_d * range_d[None, :])
    )
    q = tl.load(
        q_block_ptr,
        mask=q_mask_m[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load cl
    cl_block_ptr = (
        cl_ptr
        + stride_cl_e * idx_e
        + stride_cl_h * idx_h
        + stride_cl_b * idx_b
        + (stride_cl_m * range_m)
    )
    cl = tl.load(cl_block_ptr, mask=mask_m, other=0.0)

    # Attention matrix: entropy-corrected softmax over the block axis (axis 0),
    # then padded/invalid query columns are zeroed — mirrors ar_cr_ref.
    l = tl.dot(al, tl.trans(q))
    l = l - cl[:, None]
    l = l + tl.where(mask_m[:, None], 0.0, float("-inf"))
    l = tl.exp(l - tl.max(l, axis=0, keep_dims=True))
    l = l / tl.sum(l, axis=0, keep_dims=True)
    l = q_mask_m[None, :] * l

    # Store cr (row sums of l)
    cr = tl.sum(l, axis=1)
    cr_block_ptr = (
        cr_ptr
        + stride_cr_e * idx_e
        + stride_cr_h * idx_h
        + stride_cr_b * idx_b
        + (stride_cr_m * range_m)
    )
    tl.store(cr_block_ptr, cr, mask=mask_m)

    # Store ar = l @ q
    ar = tl.dot(l.to(q.dtype), q).to(al.dtype)
    ar_block_ptr = (
        ar_ptr
        + stride_ar_e * idx_e
        + stride_ar_h * idx_h
        + stride_ar_b * idx_b
        + (stride_ar_m * range_m[:, None] + stride_ar_d * range_d[None, :])
    )
    tl.store(
        ar_block_ptr,
        ar,
        mask=mask_m[:, None] & mask_d[None, :],
    )
314
+
315
+
316
@triton.jit
def _al_y_cl_kernel(
    ar_ptr,
    stride_ar_e,
    stride_ar_h,
    stride_ar_m,
    stride_ar_b,
    stride_ar_d,
    k_ptr,
    stride_k_e,
    stride_k_h,
    stride_k_m,
    stride_k_b,
    stride_k_d,
    v_ptr,
    stride_v_e,
    stride_v_h,
    stride_v_m,
    stride_v_b,
    stride_v_d,
    cr_ptr,
    stride_cr_e,
    stride_cr_h,
    stride_cr_m,
    stride_cr_b,
    al_ptr,
    stride_al_e,
    stride_al_h,
    stride_al_m,
    stride_al_b,
    stride_al_d,
    y_ptr,
    stride_y_e,
    stride_y_h,
    stride_y_m,
    stride_y_b,
    stride_y_d,
    cl_ptr,
    stride_cl_e,
    stride_cl_h,
    stride_cl_m,
    stride_cl_b,
    mask_ptr,
    stride_mask_e,
    stride_mask_m,
    stride_mask_b,
    H: int,
    M: int,
    B: int,
    D: int,
    N: int,
    is_first_call: int,
    sm_scale: float,
    HAS_ATTN_MASK: tl.constexpr,
    BLOCK_B: tl.constexpr,
    BLOCK_D: tl.constexpr,
    PRE_PAD: tl.constexpr,
    EPS: tl.constexpr,
):
    # Final left-factor update fused with the value product: one program per
    # (e, h, m) recomputes al and cl and additionally produces y = r @ v,
    # which _z_kernel later combines into the final output.
    # When is_first_call (i.e. T == 1) the "ar" buffer is really q in its
    # raw sequence layout, hence the pad-offset adjustment guarded below.
    idx_ehm = tl.program_id(0)
    idx_eh = idx_ehm // M
    idx_e = idx_eh // H
    idx_h = idx_eh % H
    idx_m = idx_ehm % M

    # With PRE_PAD, the first (M*B - N) virtual positions are padding.
    pad_offset = M * B - N if PRE_PAD else 0

    range_b = tl.arange(0, BLOCK_B)
    range_d = tl.arange(0, BLOCK_D)
    # Flat sequence positions inside block row m.
    range_n = B * idx_m + range_b

    mask_b = range_b < B
    pad_mask_b = mask_b & ((range_n >= pad_offset) if PRE_PAD else range_n < N)
    k_mask_b = pad_mask_b
    mask_d = range_d < D

    if HAS_ATTN_MASK:
        # Fold the user-supplied attention mask into the key-validity mask.
        mask_block_ptr = (
            mask_ptr
            + stride_mask_e * idx_e
            + stride_mask_m * idx_m
            + stride_mask_b * (range_b - pad_offset)
        )
        valid_token_mask = tl.load(
            mask_block_ptr,
            mask=pad_mask_b,
            other=0,
        )
        k_mask_b = pad_mask_b & valid_token_mask

    # Load ar (= q on the first call, so shift by pad_offset only then)
    ar_block_ptr = (
        ar_ptr
        + stride_ar_e * idx_e
        + stride_ar_h * idx_h
        + stride_ar_m * idx_m
        + (
            stride_ar_b * (range_b - (pad_offset if is_first_call else 0))[:, None]
            + stride_ar_d * range_d[None, :]
        )
    )
    ar = tl.load(
        ar_block_ptr,
        mask=(pad_mask_b if is_first_call else mask_b)[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load k
    k_block_ptr = (
        k_ptr
        + stride_k_e * idx_e
        + stride_k_h * idx_h
        + stride_k_m * idx_m
        + (stride_k_b * (range_b - pad_offset)[:, None] + stride_k_d * range_d[None, :])
    )
    k = tl.load(
        k_block_ptr,
        mask=k_mask_b[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load cr (normalizer from the right update; 1.0 where out of range)
    cr_block_ptr = (
        cr_ptr
        + stride_cr_e * idx_e
        + stride_cr_h * idx_h
        + stride_cr_m * idx_m
        + (stride_cr_b * range_b)
    )
    cr = tl.load(cr_block_ptr, mask=mask_b, other=1.0)

    # Attention matrix: scaled ar–k scores divided by cr, row softmax over
    # keys with invalid keys sent to -inf; EPS guards both divisions.
    r = sm_scale * tl.dot(ar, tl.trans(k))
    r = r / (cr[:, None] + EPS)
    r = r + tl.where(k_mask_b[None, :], 0.0, float("-inf"))
    r = tl.exp(r - tl.clamp(tl.max(r, axis=1, keep_dims=True), EPS, float("inf")))
    r = r / (tl.sum(r, axis=1, keep_dims=True) + EPS)

    # Store cl (row sums of xlogx(r); xlogx is a helper defined elsewhere
    # in this module)
    cl = tl.sum(xlogx(r), axis=1)
    cl_block_ptr = (
        cl_ptr
        + stride_cl_e * idx_e
        + stride_cl_h * idx_h
        + stride_cl_m * idx_m
        + (stride_cl_b * range_b)
    )
    tl.store(cl_block_ptr, cl, mask=mask_b)

    # Store al (attention-weighted keys, rescaled by sm_scale)
    al = (sm_scale * tl.dot(r.to(k.dtype), k)).to(ar.dtype)
    al_block_ptr = (
        al_ptr
        + stride_al_e * idx_e
        + stride_al_h * idx_h
        + stride_al_m * idx_m
        + (stride_al_b * range_b[:, None] + stride_al_d * range_d[None, :])
    )
    tl.store(
        al_block_ptr,
        al,
        mask=mask_b[:, None] & mask_d[None, :],
    )

    # Load v
    v_block_ptr = (
        v_ptr
        + stride_v_e * idx_e
        + stride_v_h * idx_h
        + stride_v_m * idx_m
        + (stride_v_b * (range_b - pad_offset)[:, None] + stride_v_d * range_d[None, :])
    )
    v = tl.load(
        v_block_ptr,
        mask=k_mask_b[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Store y (per-block attention output, consumed by _z_kernel)
    y = tl.dot(r.to(v.dtype), v).to(ar.dtype)
    y_block_ptr = (
        y_ptr
        + stride_y_e * idx_e
        + stride_y_h * idx_h
        + stride_y_m * idx_m
        + (stride_y_b * range_b[:, None] + stride_y_d * range_d[None, :])
    )
    tl.store(
        y_block_ptr,
        y,
        mask=mask_b[:, None] & mask_d[None, :],
    )
508
+
509
+
510
@triton.jit
def _z_kernel(
    al_ptr,
    stride_al_e,
    stride_al_h,
    stride_al_m,
    stride_al_b,
    stride_al_d,
    q_ptr,
    stride_q_e,
    stride_q_h,
    stride_q_m,
    stride_q_b,
    stride_q_d,
    y_ptr,
    stride_y_e,
    stride_y_h,
    stride_y_m,
    stride_y_b,
    stride_y_d,
    cl_ptr,
    stride_cl_e,
    stride_cl_h,
    stride_cl_m,
    stride_cl_b,
    z_ptr,
    stride_z_e,
    stride_z_h,
    stride_z_m,
    stride_z_b,
    stride_z_d,
    H: int,
    M: int,
    B: int,
    D: int,
    N: int,
    BLOCK_M: tl.constexpr,
    BLOCK_D: tl.constexpr,
    PRE_PAD: tl.constexpr,
):
    # Output pass: one program per (e, h, b) forms the left attention from
    # al/q/cl and applies it to the intermediate y, writing the result z
    # back in the original (pad-adjusted) sequence layout.
    idx_ehb = tl.program_id(0)
    idx_eh = idx_ehb // B
    idx_e = idx_eh // H
    idx_h = idx_eh % H
    idx_b = idx_ehb % B

    # With PRE_PAD, the first (M*B - N) virtual positions are padding.
    pad_offset = M * B - N if PRE_PAD else 0

    range_m = tl.arange(0, BLOCK_M)
    range_d = tl.arange(0, BLOCK_D)
    # Flat sequence positions handled by this program (slot b of every block).
    range_n = idx_b + B * range_m

    mask_m = range_m < M
    # Rows that correspond to real (non-padding) tokens.
    q_mask_m = mask_m & (range_n >= pad_offset if PRE_PAD else range_n < N)
    mask_d = range_d < D

    # Load al
    al_block_ptr = (
        al_ptr
        + stride_al_e * idx_e
        + stride_al_h * idx_h
        + stride_al_b * idx_b
        + (stride_al_m * range_m[:, None] + stride_al_d * range_d[None, :])
    )
    al = tl.load(
        al_block_ptr,
        mask=mask_m[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load q (sequence index shifted by pad_offset when pre-padding)
    q_block_ptr = (
        q_ptr
        + stride_q_e * idx_e
        + stride_q_h * idx_h
        + stride_q_b * (idx_b - pad_offset)
        + (stride_q_m * range_m[:, None] + stride_q_d * range_d[None, :])
    )
    q = tl.load(
        q_block_ptr,
        mask=q_mask_m[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Load cl
    cl_block_ptr = (
        cl_ptr
        + stride_cl_e * idx_e
        + stride_cl_h * idx_h
        + stride_cl_b * idx_b
        + (stride_cl_m * range_m)
    )
    cl = tl.load(cl_block_ptr, mask=mask_m, other=0.0)

    # Attention matrix: q–al scores corrected by cl, row softmax over block
    # rows (stabilized by subtracting the row max).
    l = tl.dot(q, tl.trans(al))
    l = l - cl[None, :]
    l = l + tl.where(mask_m[None, :], 0.0, float("-inf"))
    l = tl.exp(l - tl.max(l, axis=1, keep_dims=True))
    l = l / tl.sum(l, axis=1, keep_dims=True)

    # Load y
    y_block_ptr = (
        y_ptr
        + stride_y_e * idx_e
        + stride_y_h * idx_h
        + stride_y_b * idx_b
        + (stride_y_m * range_m[:, None] + stride_y_d * range_d[None, :])
    )
    y = tl.load(
        y_block_ptr,
        mask=mask_m[:, None] & mask_d[None, :],
        other=0.0,
    )

    # Store z (only real token rows are written)
    z = tl.dot(l.to(y.dtype), y).to(al.dtype)
    z_block_ptr = (
        z_ptr
        + stride_z_e * idx_e
        + stride_z_h * idx_h
        + stride_z_b * (idx_b - pad_offset)
        + (stride_z_m * range_m[:, None] + stride_z_d * range_d[None, :])
    )
    tl.store(
        z_block_ptr,
        z,
        mask=q_mask_m[:, None] & mask_d[None, :],
    )
639
+
640
+
641
def monarch_attention_triton(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Tensor | None,
    T: int,
    B: int,
    pre_pad: bool,
    eps: float = 0.0,
) -> Tensor:
    """Monarch attention computed with the Triton kernels above.

    Args:
        q, k, v: (E, H, N, D) query/key/value tensors.
        attn_mask: optional token-validity mask, or None to disable masking.
        T: number of alternating factor-update steps.
        B: Monarch block size; the sequence is viewed as (M, B) blocks with
            M = ceil(N / B).
        pre_pad: if True, the virtual padding precedes the real tokens.
        eps: numerical-stability constant forwarded to the kernels as EPS.

    Returns:
        (E, H, N, D) attention output tensor z.
    """
    E, H, N, D = q.shape
    M = triton.cdiv(N, B)

    HMBDN = (H, M, B, D, N)

    # One program per (e, h, m) or per (e, h, b), depending on the kernel.
    grid_ehm = (E * H * M,)
    grid_ehb = (E * H * B,)

    # Triton block sizes: next power of two, floored at 16 (tl.dot minimum).
    BLOCK_B = max(triton.next_power_of_2(B), 16)
    BLOCK_M = max(triton.next_power_of_2(M), 16)
    BLOCK_D = max(triton.next_power_of_2(D), 16)

    sm_scale = 1 / sqrt(D)

    # View the (N,) sequence axis as (M, B) purely through strides: the
    # per-block-row stride is B tokens, so no reshape/copy is needed.
    q_strides = (q.stride(0), q.stride(1), B * q.stride(2), q.stride(2), q.stride(3))
    k_strides = (k.stride(0), k.stride(1), B * k.stride(2), k.stride(2), k.stride(3))
    v_strides = (v.stride(0), v.stride(1), B * v.stride(2), v.stride(2), v.stride(3))

    # Right and left Monarch factors.
    ar = torch.empty(E, H, M, B, D, device=q.device, dtype=q.dtype)
    al = torch.empty_like(ar)

    ar_strides = (ar.stride(0), ar.stride(1), ar.stride(2), ar.stride(3), ar.stride(4))
    al_strides = (al.stride(0), al.stride(1), al.stride(2), al.stride(3), al.stride(4))

    # Normalizers kept in float32; cr starts at 1 so the first left update
    # divides by one.
    cr = torch.ones(E, H, M, B, device=q.device, dtype=torch.float)
    cl = torch.empty_like(cr)

    cr_strides = (cr.stride(0), cr.stride(1), cr.stride(2), cr.stride(3))
    cl_strides = (cl.stride(0), cl.stride(1), cl.stride(2), cl.stride(3))

    # Mask strides use the same stride trick; zeros when there is no mask.
    attn_mask_strides = (
        (attn_mask.stride(0), B * attn_mask.stride(1), attn_mask.stride(1))
        if attn_mask is not None
        else (0, 0, 0)
    )

    # T - 1 alternating updates of the left (al, cl) and right (ar, cr)
    # factors; on the very first step the right factor is the raw queries.
    for t in range(T - 1):
        is_first_call = t == 0
        _ar = q if is_first_call else ar
        _ar_strides = q_strides if is_first_call else ar_strides
        _al_cl_kernel[grid_ehm](
            _ar,
            *_ar_strides,
            k,
            *k_strides,
            cr,
            *cr_strides,
            al,
            *al_strides,
            cl,
            *cl_strides,
            attn_mask,
            *attn_mask_strides,
            *HMBDN,
            is_first_call=is_first_call,
            sm_scale=sm_scale,
            HAS_ATTN_MASK=attn_mask is not None,  # type: ignore
            BLOCK_B=BLOCK_B,  # type: ignore
            BLOCK_D=BLOCK_D,  # type: ignore
            PRE_PAD=pre_pad,  # type: ignore
            EPS=eps,  # type: ignore
        )

        _ar_cr_kernel[grid_ehb](
            al,
            *al_strides,
            q,
            *q_strides,
            cl,
            *cl_strides,
            ar,
            *ar_strides,
            cr,
            *cr_strides,
            attn_mask,
            *attn_mask_strides,
            *HMBDN,
            HAS_ATTN_MASK=attn_mask is not None,  # type: ignore
            BLOCK_M=BLOCK_M,  # type: ignore
            BLOCK_D=BLOCK_D,  # type: ignore
            PRE_PAD=pre_pad,  # type: ignore
        )

    # Final left update fused with the value product.
    y = torch.empty_like(al)
    y_strides = (y.stride(0), y.stride(1), y.stride(2), y.stride(3), y.stride(4))

    # With T == 1 the loop above never ran, so ar still has to be read as q.
    is_first_call_y = T == 1
    _ar_y = q if is_first_call_y else ar
    _ar_y_strides = q_strides if is_first_call_y else ar_strides

    _al_y_cl_kernel[grid_ehm](
        _ar_y,
        *_ar_y_strides,
        k,
        *k_strides,
        v,
        *v_strides,
        cr,
        *cr_strides,
        al,
        *al_strides,
        y,
        *y_strides,
        cl,
        *cl_strides,
        attn_mask,
        *attn_mask_strides,
        *HMBDN,
        is_first_call=is_first_call_y,
        sm_scale=sm_scale,
        HAS_ATTN_MASK=attn_mask is not None,  # type: ignore
        BLOCK_B=BLOCK_B,  # type: ignore
        BLOCK_D=BLOCK_D,  # type: ignore
        PRE_PAD=pre_pad,  # type: ignore
        EPS=eps,  # type: ignore
    )

    # Output pass: combine the left attention with y into z (same layout as v).
    z = torch.empty_like(v)
    z_strides = (z.stride(0), z.stride(1), B * z.stride(2), z.stride(2), z.stride(3))

    _z_kernel[grid_ehb](
        al,
        *al_strides,
        q,
        *q_strides,
        y,
        *y_strides,
        cl,
        *cl_strides,
        z,
        *z_strides,
        *HMBDN,
        BLOCK_M=BLOCK_M,  # type: ignore
        BLOCK_D=BLOCK_D,  # type: ignore
        PRE_PAD=pre_pad,  # type: ignore
    )

    return z
monarch_attn/monarch_attention.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Callable
2
+ from enum import StrEnum
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .ma_torch import monarch_attention_torch
8
+
9
Tensor = torch.Tensor

# Signature shared by every backend:
# (q, k, v, attn_mask, num_steps T, block_size B, pre_pad) -> output
MonarchAttentionFn = Callable[
    [Tensor, Tensor, Tensor, Tensor | None, int, int, bool], Tensor
]

# Backend registry keyed by implementation name ("torch", "triton", ...).
_IMPLEMENTATIONS: dict[str, MonarchAttentionFn] = {}
16
+
17
+
18
+ def register_impl(name: str, fn: MonarchAttentionFn) -> None:
19
+ _IMPLEMENTATIONS[name] = fn
20
+
21
+
22
register_impl("torch", monarch_attention_torch)

# The Triton backend is optional: register it only when its module (and the
# triton dependency it pulls in) imports cleanly.
try:
    from .ma_triton import monarch_attention_triton

    register_impl("triton", monarch_attention_triton)
except ImportError:
    pass
30
+
31
+
32
+ class PadType(StrEnum):
33
+ pre = "pre"
34
+ post = "post"
35
+
36
+
37
+ class MonarchAttention(nn.Module):
38
+
39
+ def __init__(self, block_size, num_steps, pad_type, impl="torch"):
40
+ super().__init__()
41
+ self.block_size = block_size
42
+ self.num_steps = num_steps
43
+ self.pad_type = pad_type
44
+
45
+ if impl not in _IMPLEMENTATIONS:
46
+ available = ", ".join(sorted(_IMPLEMENTATIONS))
47
+ raise ValueError(f"Unknown impl {impl!r}. Available: {available}")
48
+ self._impl_fn = _IMPLEMENTATIONS[impl]
49
+
50
+ def forward(self, query, key, value, attention_mask=None):
51
+ return self._impl_fn(
52
+ query,
53
+ key,
54
+ value,
55
+ attention_mask,
56
+ self.num_steps,
57
+ self.block_size,
58
+ self.pad_type == PadType.pre,
59
+ )
60
+
61
+ def get_matrix(self, query, key, attention_mask=None):
62
+ batch_size, num_heads, seq_len, _ = query.shape
63
+ value = torch.eye(seq_len, device=query.device).expand(
64
+ batch_size, num_heads, seq_len, seq_len
65
+ )
66
+ return self.forward(query, key, value, attention_mask)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
torch
torchvision
Pillow
einops
numpy
gradio
utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from PIL import Image
4
+
5
def smart_padding(image, divisor=16):
    """Symmetrically pad `image` so its H and W are multiples of `divisor`.

    Returns the padded tensor together with the `(left, right, top, bottom)`
    padding amounts, so the operation can be undone with `remove_padding`.
    """
    height, width = image.shape[-2:]
    # Amount needed to reach the next multiple (0 when already divisible).
    extra_h = -height % divisor
    extra_w = -width % divisor
    top = extra_h // 2
    bottom = extra_h - top
    left = extra_w // 2
    right = extra_w - left
    pads = (left, right, top, bottom)
    # F.pad consumes (left, right, top, bottom) for the last two dims;
    # the border is filled with the constant 1.0.
    return F.pad(image, pads, mode='constant', value=1.0), pads
17
+
18
def remove_padding(image, padding):
    """Crop away the `(left, right, top, bottom)` border added by `smart_padding`."""
    left, right, top, bottom = padding
    # A zero amount must map to "slice to the end", not `-0` (empty slice).
    h_stop = -bottom if bottom else image.shape[-2]
    w_stop = -right if right else image.shape[-1]
    return image[..., top:h_stop, left:w_stop]
weights/colorizer.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a8ed3573050cdd3fc9c9542ec9dd76e91143fd2052b1c379f87422c59ae60fc
3
+ size 32434763