Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- .venv/lib/python3.11/site-packages/xformers/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/_cpp_lib.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/_deprecation_warning.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/attn_bias_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/checkpoint.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/info.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/test.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/__pycache__/version.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/benchmarks/benchmark_mem_eff_attention.py +373 -0
- .venv/lib/python3.11/site-packages/xformers/benchmarks/benchmark_sp24.py +178 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__init__.py +124 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/_sputnik_sparse.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/attention_mask.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/attention_patterns.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/base.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/compositional.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/core.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/favor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/fourier_mix.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/global_tokens.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/lambda_layer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/linformer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/local.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/nystrom.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/ortho.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/pooling.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/random.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/scaled_dot_product.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/sparsity_config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/visual.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/_sputnik_sparse.py +121 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/attention_mask.py +143 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/base.py +95 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/compositional.py +341 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__init__.py +26 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__pycache__/base.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__pycache__/softmax.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/base.py +61 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/softmax.py +288 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/global_tokens.py +122 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/linformer.py +74 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/ortho.py +324 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/pooling.py +82 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/sparsity_config.py +812 -0
- .venv/lib/python3.11/site-packages/xformers/components/attention/utils.py +108 -0
- .venv/lib/python3.11/site-packages/xformers/components/feedforward/__init__.py +78 -0
.venv/lib/python3.11/site-packages/xformers/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.66 kB)
.venv/lib/python3.11/site-packages/xformers/__pycache__/_cpp_lib.cpython-311.pyc
ADDED
Binary file (8.38 kB)
.venv/lib/python3.11/site-packages/xformers/__pycache__/_deprecation_warning.cpython-311.pyc
ADDED
Binary file (661 Bytes)
.venv/lib/python3.11/site-packages/xformers/__pycache__/attn_bias_utils.cpython-311.pyc
ADDED
Binary file (22.6 kB)
.venv/lib/python3.11/site-packages/xformers/__pycache__/checkpoint.cpython-311.pyc
ADDED
Binary file (27.3 kB)
.venv/lib/python3.11/site-packages/xformers/__pycache__/info.cpython-311.pyc
ADDED
Binary file (4.55 kB)
.venv/lib/python3.11/site-packages/xformers/__pycache__/test.cpython-311.pyc
ADDED
Binary file (177 Bytes)
.venv/lib/python3.11/site-packages/xformers/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (8.17 kB)
.venv/lib/python3.11/site-packages/xformers/__pycache__/version.cpython-311.pyc
ADDED
Binary file (207 Bytes)
.venv/lib/python3.11/site-packages/xformers/benchmarks/benchmark_mem_eff_attention.py
ADDED
@@ -0,0 +1,373 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import itertools
import random
from functools import partial

import torch
from torch.utils import benchmark

import xformers.ops
import xformers.ops.fmha as fmha
from xformers.attn_bias_utils import create_attn_bias, ref_attention
from xformers.benchmarks.utils import benchmark_main_helper, create_argparser

torch.backends.cuda.matmul.allow_tf32 = False

min_run_time = 0.5
device = torch.device("cuda")

NUM_THREADS = [1] if device.type == "cuda" else [1, 40]
VISION_SHAPES = [
    # ViT
    (384, 197, 1, 88),
    (384, 197, 1, 80),
    (384, 197, 1, 64),
    (1024, 197, 1, 88),
    (1024, 197, 1, 80),
    (1024, 197, 1, 64),
    # ViT-Huge
    (32 * 16, 197, 1, 80),
    (32, 197, 16, 80),
    (32, 197, 16, 64),
    (32, 197, 16, 128),
    # ViT-Giant
    (16 * 16, 197, 1, 88),
    (16, 197, 16, 88),
    (16, 197, 16, 64),
    (16, 197, 16, 128),
    # FB models
    (1024, 82, 8, 64),
    (150, 256, 16, 64),
    (64, 256, 12, 64),
    # Stable diffusion (https://github.com/huggingface/diffusers/pull/532)
    (1, 4096, 16, 40),  # 512x512
    (1, 16384, 16, 40),  # 1024x1024
    (1, 4096, 16, 80),
    (1, 16384, 16, 80),
    # + bs4
    (4, 4096, 16, 40),
    (4, 16384, 16, 40),
    (4, 4096, 16, 80),
    (4, 16384, 16, 80),
    # ParlAI model
    (256, 4096, 16, 64),
    # Zetta B M H K
    (8, 2048, 20, 128),
]

LLM_SHAPES = [
    # LLaMa 70b - mp=8/16
    *sorted(itertools.product([1, 2], [2048, 4096, 8192], [4, 8], [128])),
    *sorted(
        itertools.product([16], [128, 512, 1024], [16], [16, 32, 64, 128, 160, 256])
    ),
]


OPS = [
    (xformers.ops.fmha.cutlass.FwOp, xformers.ops.fmha.cutlass.BwOp),
    (xformers.ops.fmha.flash.FwOp, xformers.ops.fmha.flash.BwOp),
    (xformers.ops.fmha.flash3.FwOp, xformers.ops.fmha.flash3.BwOp),
    (xformers.ops.fmha.ck.FwOp, xformers.ops.fmha.ck.BwOp),
]


def product_dict(**kwargs):
    keys = kwargs.keys()
    vals = kwargs.values()
    for instance in itertools.product(*vals):
        yield dict(zip(keys, instance))


VISION_CASES, LLM_CASES = [
    list(
        product_dict(
            shape_q=SHAPES,
            num_threads=NUM_THREADS,
            dropout_p=[0.0],
            attn_bias_cfg=[(type(None), False)],
            dtype=[torch.half],
        )
    )
    for SHAPES in (VISION_SHAPES, LLM_SHAPES)
]

# Add more cases with some variations
for c in VISION_CASES.copy():
    c = c.copy()
    c.update(
        random.Random(str(c["shape_q"])).choice(
            [
                {"dropout_p": 0.3},
                {"attn_bias_cfg": (torch.Tensor, False)},
                {"attn_bias_cfg": (torch.Tensor, True)},
                {"dtype": torch.bfloat16},
                {"dtype": torch.float},
            ]
        )
    )
    VISION_CASES.append(c)


LLM_CASE_UPDATES = [
    {"attn_bias_cfg": (torch.Tensor, True)},
    {"attn_bias_cfg": (xformers.ops.LowerTriangularMask, False)},
    *[
        {
            "attn_bias_cfg": (
                xformers.ops.fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
                False,
            ),
            "Hkv": Hkv,
            "dtype": torch.bfloat16,
        }
        for Hkv in [1, 2]
    ],
]

for c in LLM_CASES.copy():
    for update in LLM_CASE_UPDATES:
        c = c.copy()
        c.update(update)
        LLM_CASES.append(c)

CASES = VISION_CASES + LLM_CASES


def create_tensors(shape_q, Hkv, dtype, requires_grad=False, packed=True):
    stacked_shape = list(shape_q)  # B, M, H, K
    Hq = shape_q[2]
    stacked_dim = 2 if packed else 0
    stacked_shape.insert(stacked_dim, 3)
    qkv = torch.rand(
        stacked_shape, device=device, dtype=dtype, requires_grad=requires_grad
    )
    q = torch.rand(shape_q, device=device, dtype=dtype, requires_grad=requires_grad)
    shape_kv = (shape_q[0], shape_q[1], Hkv, shape_q[3])
    k = (
        torch.rand(shape_kv, device=device, dtype=dtype, requires_grad=requires_grad)
        .reshape(shape_q[0], shape_q[1], 1, Hkv, shape_q[3])
        .expand(shape_q[0], shape_q[1], Hq // Hkv, Hkv, shape_q[3])
        .reshape(shape_q)
    )
    v = (
        torch.rand(shape_kv, device=device, dtype=dtype, requires_grad=requires_grad)
        .reshape(shape_q[0], shape_q[1], 1, Hkv, shape_q[3])
        .expand(shape_q[0], shape_q[1], Hq // Hkv, Hkv, shape_q[3])
        .reshape(shape_q)
    )

    return qkv, q, k, v


def mem_eff_attention_fw(
    shape_q,
    num_threads: int,
    attn_bias_cfg,
    dropout_p,
    dtype,
    packed=True,
    Hkv=None,
):
    B, M, Hq, K = shape_q
    Hkv = Hkv or Hq
    _, q, k, v = create_tensors(
        shape_q,
        Hkv,
        dtype,
        requires_grad=False,
        packed=packed,
    )
    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg
    if attn_bias_requires_grad:
        return

    dtype_str = {
        torch.bfloat16: "b16",
        torch.half: "f16",
        torch.float: "f32",
    }[dtype]
    sub_label = (
        f"{dtype_str} {B}-{M}-{Hq}-{Hkv}-{K}, p={dropout_p}, "
        f"BiasT={attn_bias_type.__name__}"
    )

    has_run = False
    for fw_op, bw_op in OPS:
        bias = create_attn_bias(
            attn_bias_type,
            batch_size=B,
            num_heads=Hq,
            num_heads_groups=Hq // Hkv,
            q_len=M,
            kv_len=M,
            dtype=dtype,
            device=device,
            requires_grad=attn_bias_requires_grad,
            fmt="BMHK",
            op=fw_op,
        )
        inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)
        if isinstance(
            bias,
            (
                fmha.attn_bias.BlockDiagonalMask,
                fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask,
            ),
        ):
            q, k, v = [x.reshape([1, -1, *x.shape[2:]]) for x in [q, k, v]]
        if not fw_op.supports(inp):
            continue

        yield benchmark.Timer(
            stmt="fn(q, k, v, attn_bias, p)",
            globals={
                "q": q,
                "k": k,
                "v": v,
                "attn_bias": inp.attn_bias,
                "p": dropout_p,
                "fn": partial(
                    xformers.ops.memory_efficient_attention, op=(fw_op, bw_op)
                ),
            },
            label=f"attention (attn_bias={attn_bias_type})",
            description=fw_op.NAME,
            sub_label=sub_label,
            num_threads=num_threads,
        )
        has_run = True

    if not has_run:
        return

    yield benchmark.Timer(
        stmt="fn(q, k, v, attn_bias, p)",
        globals={
            "q": q,
            "k": k,
            "v": v,
            "attn_bias": inp.attn_bias,
            "p": dropout_p,
            "fn": ref_attention,
        },
        label=f"attention (attn_bias={attn_bias_type})",
        description="eager",
        sub_label=sub_label,
        num_threads=num_threads,
    )


def mem_eff_attention_bw(
    shape_q, num_threads: int, attn_bias_cfg, dropout_p, dtype, Hkv=None
):
    B, M, Hq, K = shape_q
    Hkv = Hkv or Hq
    _, q, k, v = create_tensors(
        shape_q,
        Hkv,
        dtype,
        requires_grad=True,
    )

    attn_bias_type, attn_bias_requires_grad = attn_bias_cfg

    dtype_str = {
        torch.bfloat16: "b16",
        torch.half: "f16",
        torch.float: "f32",
    }[dtype]
    sub_label = (
        f"{dtype_str} {B}-{M}-{Hq}-{Hkv}-{K}, p={dropout_p}, "
        f"BiasT={attn_bias_type.__name__}, BiasGrad={attn_bias_requires_grad}"
    )

    has_run = False
    for fw_op, bw_op in OPS:
        bias = create_attn_bias(
            attn_bias_type,
            batch_size=B,
            num_heads=Hq,
            num_heads_groups=Hq // Hkv,
            q_len=M,
            kv_len=M,
            dtype=dtype,
            device=device,
            requires_grad=attn_bias_requires_grad,
            fmt="BMHK",
            op=bw_op,
        )
        inp = fmha.Inputs(query=q, key=k, value=v, attn_bias=bias, p=dropout_p)

        if not fw_op.supports(inp) or not bw_op.supports(inp):
            continue
        has_run = True
        out = xformers.ops.memory_efficient_attention(
            inp.query, inp.key, inp.value, inp.attn_bias, inp.p, op=(fw_op, bw_op)
        )
        grad_benchmark = torch.ones_like(q)

        yield benchmark.Timer(
            stmt="out.backward(grad, retain_graph=True)",
            globals={
                "out": out,
                "grad": grad_benchmark,
            },
            label=f"attention backward (attn_bias={attn_bias_type})",
            description=bw_op.NAME,
            sub_label=sub_label,
            num_threads=num_threads,
        )
        del out

    if not has_run:
        return
    yield benchmark.Timer(
        stmt="out.backward(grad, retain_graph=True)",
        globals={
            "out": ref_attention(q, k, v, inp.attn_bias, dropout_p),
            "grad": grad_benchmark,
        },
        label=f"attention backward (attn_bias={attn_bias_type})",
        description="vanilla",
        sub_label=sub_label,
        num_threads=num_threads,
    )


def main():
    arg_parser = create_argparser()
    arg_parser.add_argument(
        "--omit-forward",
        action="store_true",
        help="Do not run forward benchmarks",
    )
    arg_parser.add_argument(
        "--omit-backward",
        action="store_true",
        help="Do not run backward benchmarks",
    )
    args = arg_parser.parse_args()
    if not args.omit_forward:
        benchmark_main_helper(
            mem_eff_attention_fw,
            CASES,
            arg_parser=arg_parser,
            min_run_time=min_run_time,
        )
    if not args.omit_backward:
        benchmark_main_helper(
            mem_eff_attention_bw,
            CASES,
            arg_parser=arg_parser,
            min_run_time=min_run_time,
        )


if __name__ == "__main__":
    main()
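For reference, the call being timed above is xformers.ops.memory_efficient_attention on BMHK tensors; a minimal sketch of that entry point (assuming a CUDA build of xFormers, with illustrative shapes) looks like:

import torch
import xformers.ops as xops

B, M, H, K = 2, 1024, 8, 64  # batch, sequence length, heads, head dim (illustrative)
q = torch.randn(B, M, H, K, device="cuda", dtype=torch.half)
k = torch.randn(B, M, H, K, device="cuda", dtype=torch.half)
v = torch.randn(B, M, H, K, device="cuda", dtype=torch.half)

# Same function the benchmark wraps with functools.partial above
out = xops.memory_efficient_attention(q, k, v, attn_bias=None, p=0.0)
print(out.shape)  # (B, M, H, K)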
.venv/lib/python3.11/site-packages/xformers/benchmarks/benchmark_sp24.py
ADDED
@@ -0,0 +1,178 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from typing import Tuple

import torch
import torch.nn.functional as F
from torch import nn
from utils import DTYPE2STR, benchmark_main_helper2, product_dict

import xformers.ops as xops

min_run_time = 0.5
device = torch.device("cuda")

CASES = list(
    product_dict(
        B_in_hidden_out_ft=[
            (2048 * 8, 2048, 2048 * 3, 2048),
            (2048, 5120, 5120 * 3, 5120),  # 13b
            (1024, 8192, 8192 * 3, 8192),  # 30b
            (2048, 8192, 8192 * 3, 8192),  # 30b
            (2048 * 2, 8192, 8192 * 3, 8192),  # 30b
            # DINO ViT-L: lg + sm crops (patch16)
            (64 * 2 * (14 * 14 + 1) + 64 * 8 * (6 * 6 + 1), 1024, 1024 * 4, 1024),
            # DINO ViT-g: lg + sm crops (patch16)
            (
                12 * 2 * (16 * 16 + 1 + 11) + 12 * 8 * (7 * 7 + 1 + 11),
                1536,
                1536 * 4,
                1536,
            ),
        ],
        dtype=[torch.half],
        bias=[False],
    )
)


class Mlp(nn.Module):
    LINEAR_CLS = nn.Linear

    def __init__(
        self, B_in_hidden_out_ft: Tuple[int, int, int, int], dtype, bias: bool, bw: bool
    ) -> None:
        B, in_ft, hid_ft, out_ft = B_in_hidden_out_ft
        super().__init__()
        self.label = "mlp"
        self.sub_label = (
            f"{DTYPE2STR[dtype]} ({B},{in_ft},{hid_ft},{out_ft}){' b' if bias else ''}"
        )
        self.fc1 = self.LINEAR_CLS(in_ft, hid_ft, bias=bias)
        self.act = nn.GELU()
        self.fc2 = self.LINEAR_CLS(hid_ft, out_ft, bias=bias)
        self.grad = torch.randn([B, out_ft], device="cuda", dtype=dtype)
        self.input = torch.randn(
            [B, in_ft], device="cuda", dtype=dtype, requires_grad=True
        )
        self.out = self.input
        self.to("cuda").to(dtype)

    def fw(self):
        x = self.input
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        self.out = x

    def bw(self):
        self.out.backward(self.grad, retain_graph=True)


class MlpDenseMask(Mlp):
    def fw(self):
        x = self.input
        x = self.fc1(x)

        mask = torch.ops.xformers.sparse24_largest_mask_2d(x)
        x = mask * x

        x = self.act(x)
        x = self.fc2(x)
        self.out = x


class MlpAct24(Mlp):
    def fw(self):
        x = self.input
        x = self.fc1(x)

        x = xops.sparsify24(x)

        x = self.act(x)
        x = self.fc2(x)
        self.out = x


class LinearW24(torch.nn.Linear):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        w_sparse = xops.sparsify24(
            self.weight,
            gradient="24dense",
            backend="cusparselt",
        )
        return F.linear(input, w_sparse, self.bias)


class MlpW24(Mlp):
    LINEAR_CLS = LinearW24


class MicrobenchmarkBase:
    def __init__(
        self, B_in_hidden_out_ft: Tuple[int, int, int, int], dtype, bias: bool, bw: bool
    ) -> None:
        B, in_ft, hid_ft, out_ft = B_in_hidden_out_ft
        super().__init__()
        self.label = "mlp"
        self.sub_label = (
            f"{DTYPE2STR[dtype]} ({B},{in_ft},{hid_ft},{out_ft}){' b' if bias else ''}"
        )
        self.input = torch.randn(
            [B, in_ft], device="cuda", dtype=dtype, requires_grad=True
        )
        self.input_colMajor = self.input.t().contiguous().t()
        self.input_sp = xops.sparsify24(self.input)

    def bw(self) -> None:
        return None


class MicrobenchmarkSparsify24(MicrobenchmarkBase):
    def fw(self) -> torch.Tensor:
        xops.sparsify24(self.input)
        return self.input


class MicrobenchmarkSp24ApplyDense(MicrobenchmarkBase):
    def fw(self) -> torch.Tensor:
        xops.sparsify24_like(self.input, pattern=self.input_sp, out_dense=True)
        return self.input


class MicrobenchmarkSp24ApplyDenseT(MicrobenchmarkBase):
    def fw(self) -> torch.Tensor:
        xops.sparsify24_like(self.input_colMajor, pattern=self.input_sp, out_dense=True)
        return self.input


class MicrobenchmarkInputClone(MicrobenchmarkBase):
    def fw(self) -> torch.Tensor:
        self.input.clone()
        return self.input


functions = {
    "act24": MlpAct24,
    "dense": Mlp,
    "w24": MlpW24,
    "s24_inp_sparsify24": MicrobenchmarkSparsify24,
    "s24_inp_apply_dense": MicrobenchmarkSp24ApplyDense,
    "s24_inp_apply_dense_t": MicrobenchmarkSp24ApplyDenseT,
    "s24_inp_clone": MicrobenchmarkInputClone,
}
benchmark_main_helper2(
    "sp24_fw", fw=True, cases=CASES, functions=functions, min_run_time=min_run_time
)
benchmark_main_helper2(
    "sp24_fwbw",
    fw=True,
    bw=True,
    cases=CASES,
    functions=functions,
    min_run_time=min_run_time,
)
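The MLP variants above differ only in where 2:4 sparsity is applied; a minimal sketch of the activation-sparsification path exercised by MlpAct24 (assuming a CUDA build that provides xformers.ops.sparsify24, and dimensions compatible with the 2:4 kernels) is:

import torch
import torch.nn as nn
import xformers.ops as xops

fc1 = nn.Linear(256, 512, bias=False).cuda().half()
fc2 = nn.Linear(512, 128, bias=False).cuda().half()
act = nn.GELU()

x = torch.randn(128, 256, device="cuda", dtype=torch.half)
x = xops.sparsify24(fc1(x))  # 2:4-sparsify the activation, as MlpAct24.fw does
out = fc2(act(x))            # downstream ops proceed as with a dense tensor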
.venv/lib/python3.11/site-packages/xformers/components/attention/__init__.py
ADDED
@@ -0,0 +1,124 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import logging
from pathlib import Path
from typing import Any, Callable, Dict, Set, Union

import torch

from xformers.utils import (
    generate_matching_config,
    get_registry_decorator,
    import_all_modules,
)

from ._sputnik_sparse import SparseCS
from .attention_mask import AttentionMask
from .base import Attention, AttentionConfig  # noqa

logger = logging.getLogger("xformers")


# CREDITS: Classy Vision registry mechanism

ATTENTION_REGISTRY: Dict[str, Any] = {}
ATTENTION_CLASS_NAMES: Set[str] = set()

# Arbitrary threshold for now,
# in between dense and sparse matrix algorithms for the attention mechanism
_DENSITY_THRESHOLD = 0.30  # noqa # from the sputnik paper, vs.
_USE_SPUTNIK = True


def build_attention(config: Union[Dict[str, Any], AttentionConfig]):
    """Builds an attention from a config.

    This assumes a 'name' key in the config which is used to determine what
    attention class to instantiate. For instance, a config `{"name": "my_attention",
    "foo": "bar"}` will find a class that was registered as "my_attention"
    (see :func:`register_attention`) and call .from_config on it."""

    if not isinstance(config, AttentionConfig):
        try:
            config_instance = generate_matching_config(
                config, ATTENTION_REGISTRY[config["name"]].config
            )
        except KeyError as e:
            name = config["name"]
            logger.warning(f"{name} not available among {ATTENTION_REGISTRY.keys()}")
            raise e
    else:
        config_instance = config

    return ATTENTION_REGISTRY[config_instance.name].constructor.from_config(
        config_instance
    )


"""Registers an Attention subclass.

This decorator allows xFormers to instantiate a subclass of Attention
from a configuration file, even if the class itself is not part of the
xFormers library. To use it, apply this decorator to an Attention
subclass, like this:

.. code-block:: python

    @dataclass
    class MyConfig:
        ...

    @register_attention('my_attention', MyConfig)
    class MyAttention(Attention):
        ...

To instantiate an attention from a configuration file, see :func:`build_attention`."""
register_attention: Callable[[str, Any], Callable[[Any], Any]] = get_registry_decorator(
    ATTENTION_REGISTRY, ATTENTION_CLASS_NAMES, Attention, AttentionConfig
)


def maybe_sparsify(matrix) -> Any:
    # Sparsify if that makes sense
    if torch.count_nonzero(matrix).item() / matrix.numel() > _DENSITY_THRESHOLD:
        # If not sparse, then AttentionMask is the reference type
        return AttentionMask.from_bool(matrix)

    return sparsify(matrix)


def sparsify(matrix):
    if _USE_SPUTNIK:
        return SparseCS(matrix)
    return matrix.to_sparse()


from .favor import FavorAttention  # noqa
from .global_tokens import GlobalAttention  # noqa
from .linformer import LinformerAttention  # noqa
from .local import LocalAttention  # noqa
from .nystrom import NystromAttention  # noqa
from .ortho import OrthoFormerAttention  # noqa
from .random import RandomAttention  # noqa
from .scaled_dot_product import ScaledDotProduct  # noqa

__all__ = [
    "ScaledDotProduct",
    "LocalAttention",
    "LinformerAttention",
    "NystromAttention",
    "RandomAttention",
    "OrthoFormerAttention",
    "GlobalAttention",
    "FavorAttention",
    "Attention",
    "AttentionMask",
    "build_attention",
    "register_attention",
]

# automatically import any Python files in the directory
import_all_modules(str(Path(__file__).parent), "xformers.components.attention")
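As a minimal illustration of the registry above (a sketch, assuming the stock components remain importable and that "scaled_dot_product" is among the registered names, as implied by the imports at the end of this file):

from xformers.components.attention import build_attention

# The "name" key selects the registered class; the remaining keys populate its config dataclass
attn = build_attention({"name": "scaled_dot_product", "dropout": 0.0})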
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (4.18 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/_sputnik_sparse.cpython-311.pyc
ADDED
Binary file (7.3 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/attention_mask.cpython-311.pyc
ADDED
Binary file (7.48 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/attention_patterns.cpython-311.pyc
ADDED
Binary file (15.5 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/base.cpython-311.pyc
ADDED
Binary file (4.5 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/compositional.cpython-311.pyc
ADDED
Binary file (14.2 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/core.cpython-311.pyc
ADDED
Binary file (11.2 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/favor.cpython-311.pyc
ADDED
Binary file (7.42 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/fourier_mix.cpython-311.pyc
ADDED
Binary file (2.14 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/global_tokens.cpython-311.pyc
ADDED
Binary file (5.47 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/lambda_layer.cpython-311.pyc
ADDED
Binary file (3.85 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/linformer.cpython-311.pyc
ADDED
Binary file (4 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/local.cpython-311.pyc
ADDED
Binary file (5.22 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/nystrom.cpython-311.pyc
ADDED
Binary file (12.9 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/ortho.cpython-311.pyc
ADDED
Binary file (15.8 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/pooling.cpython-311.pyc
ADDED
Binary file (3.26 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/random.cpython-311.pyc
ADDED
Binary file (5.43 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/scaled_dot_product.cpython-311.pyc
ADDED
Binary file (5.46 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/sparsity_config.cpython-311.pyc
ADDED
Binary file (41.9 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (4.46 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/__pycache__/visual.cpython-311.pyc
ADDED
Binary file (4.68 kB)
.venv/lib/python3.11/site-packages/xformers/components/attention/_sputnik_sparse.py
ADDED
@@ -0,0 +1,121 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import torch

from xformers.ops import masked_matmul
from xformers.sparse import SparseCSRTensor

# TODO: this is here for BC
from xformers.sparse.utils import _csr_to_coo, _dense_to_sparse  # noqa: F401


class SparseCS:
    def __init__(self, matrix, device=None):
        if device is None:
            device = torch.device("cpu")
        if matrix.ndim == 2:
            matrix = matrix[None]
        assert matrix.ndim == 3
        self._mat = SparseCSRTensor.from_dense(matrix).to(device)

    @property
    def device(self):
        return self._mat.device

    @property
    def ndim(self):
        return self._mat.ndim

    @property
    def dtype(self):
        return self._mat.dtype

    @property
    def is_sparse(self):
        return True

    @property
    def shape(self):
        return self._mat.shape[1:]

    @property
    def values(self):
        return self._mat.values()

    @property
    def row_indices(self):
        return self._mat._csr_row_indices

    @property
    def column_indices(self):
        return self._mat._csr_column_indices

    @property
    def row_offsets(self):
        return self._mat._csr_row_offsets

    @property
    def _transp_info(self):
        return self._mat._csr_transp_info

    @classmethod
    def wrap(
        cls, shape, values, row_indices, row_offsets, column_indices, _transp_info
    ):
        matrix = cls.__new__(cls)
        _shape = (values.shape[0],) + shape
        csr_matrix = SparseCSRTensor._wrap(
            _shape, values, row_indices, row_offsets, column_indices, _transp_info
        )
        matrix._mat = csr_matrix
        return matrix

    @classmethod
    def _wrap(cls, csr_matrix):
        assert isinstance(csr_matrix, SparseCSRTensor)
        matrix = cls.__new__(cls)
        matrix._mat = csr_matrix
        return matrix

    def __mul__(self, other):
        assert isinstance(other, (int, float))
        return type(self)._wrap(self._mat * other)

    def __add__(self, other):
        assert isinstance(other, type(self))
        return type(self)._wrap(self._mat + other._mat)

    def matmul_with_mask(self, a, b):
        return type(self)._wrap(masked_matmul(a, b, self._mat))

    def softmax(self):
        out = torch.nn.functional.softmax(self._mat, -1)
        return type(self)._wrap(out)

    def spmm(self, b):
        out = torch.bmm(self._mat, b)
        return out

    def transpose(self):
        out = torch.transpose(self._mat, -2, -1)
        return type(self)._wrap(out)

    def to(self, device):
        assert isinstance(device, torch.device)
        out = self._mat.to(device)
        return type(self)._wrap(out)

    def to_dense(self):
        return self._mat.to_dense()

    def logical_and(self, other: torch.Tensor):
        assert not isinstance(other, SparseCS)
        out = torch.logical_and(self._mat, other)
        return type(self)._wrap(out)

    def __and__(self, other):
        return self.logical_and(other)
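A small usage sketch for the wrapper above (not exercised against this exact version; construction and to_dense() use only methods defined in the class, while the sputnik kernels themselves target CUDA):

import torch
from xformers.components.attention._sputnik_sparse import SparseCS

# Build a CSR layout from a (batched) dense pattern and convert it back
dense = (torch.rand(1, 16, 16) > 0.7).float()
sparse = SparseCS(dense, device=torch.device("cpu"))
restored = sparse.to_dense()  # same layout and values as `dense`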
.venv/lib/python3.11/site-packages/xformers/components/attention/attention_mask.py
ADDED
@@ -0,0 +1,143 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from typing import Optional, Type, TypeVar

import torch

Self = TypeVar("Self", bound="AttentionMask")


class AttentionMask:
    """
    Holds an attention mask, along with a couple of helpers and attributes.

    .. note: this is an additive mask, meaning that coefficients which should be computed hold the '0.' value,
    and coefficients which should be skipped hold the '-inf' value. Any other value is possible if the purpose
    is to bias the attention computation for instance

    .. note: the attention mask dimensions are expected to be `[batch, to_sequence, from_sequence]`,
    `[to_sequence, from_sequence]`, or anything broadcastable in between
    """

    def __init__(self, additive_mask: torch.Tensor, is_causal: bool = False):
        assert additive_mask.is_floating_point(), additive_mask.dtype
        assert not additive_mask.requires_grad

        if additive_mask.ndim == 2:
            additive_mask = additive_mask.unsqueeze(0)

        self.values = additive_mask
        self.is_causal = is_causal
        self.seq_len = additive_mask.shape[1]
        self.to_seq_len = additive_mask.shape[0]

    def to_bool(self) -> torch.Tensor:
        """
        .. warning: we assume here that True implies that the value should be computed
        """
        return self.values != float("-inf")

    @classmethod
    def from_bool(cls: Type[Self], x: torch.Tensor) -> Self:
        """
        Create an AttentionMask given a boolean pattern.
        .. warning: we assume here that True implies that the value should be computed
        """
        assert x.dtype == torch.bool

        additive_mask = torch.empty_like(x, dtype=torch.float, device=x.device)
        additive_mask.masked_fill_(x, 0.0)
        additive_mask.masked_fill_(~x, float("-inf"))

        return cls(additive_mask)

    @classmethod
    def from_multiplicative(cls: Type[Self], x: torch.Tensor) -> Self:
        """
        Create an AttentionMask given a multiplicative attention mask.
        """
        assert not x.dtype == torch.bool

        additive_mask = torch.empty_like(x, dtype=torch.float, device=x.device)
        x = x.bool()

        additive_mask.masked_fill_(x, 0.0)
        additive_mask.masked_fill_(~x, float("-inf"))

        return cls(additive_mask)

    @classmethod
    def make_causal(
        cls: Type[Self],
        seq_len: int,
        to_seq_len: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> Self:
        if not to_seq_len:
            to_seq_len = seq_len

        additive_mask = torch.triu(
            torch.ones(seq_len, to_seq_len, device=device, dtype=dtype) * float("-inf"),
            diagonal=1,
        )
        return cls(additive_mask=additive_mask, is_causal=True)

    def make_crop(
        self, seq_len: int, to_seq_len: Optional[int] = None
    ) -> "AttentionMask":
        """
        Return a cropped attention mask, whose underlying tensor is a view of this one
        """

        if not to_seq_len:
            to_seq_len = seq_len

        return AttentionMask(
            self.values[:, :seq_len, :to_seq_len], is_causal=self.is_causal
        )

    def __repr__(self):
        return f"AttentionMask - causal {self.is_causal} - mask " + str(self.values)

    @property
    def device(self):
        return self.values.device

    @property
    def is_sparse(self):
        return False

    @property
    def ndim(self):
        return len(self.values.shape)

    @property
    def dtype(self):
        return self.values.dtype

    @property
    def shape(self):
        return self.values.shape

    def __add__(self, other):
        return AttentionMask(self.values + other.values, is_causal=False)

    def to(
        self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None
    ) -> "AttentionMask":
        assert device is None or isinstance(device, torch.device)
        assert dtype is None or isinstance(dtype, torch.dtype)
        assert device is not None or dtype is not None

        # Noop if we don't need to create another instance
        if ((device and device == self.device) or not device) and (
            (dtype and dtype == self.dtype) or not dtype
        ):
            return self

        return AttentionMask(self.values.to(device=device, dtype=dtype), self.is_causal)
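The additive convention documented in the class docstring can be exercised directly; a minimal sketch using only the constructors defined above:

import torch
from xformers.components.attention.attention_mask import AttentionMask

# Causal mask: 0. where attention is allowed, -inf above the diagonal
causal = AttentionMask.make_causal(seq_len=4)
print(causal.values)     # additive form, shape (1, 4, 4)
print(causal.to_bool())  # boolean form: True where the coefficient is computed

# From a boolean keep/skip pattern
keep = torch.rand(4, 4) > 0.5
mask = AttentionMask.from_bool(keep)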
.venv/lib/python3.11/site-packages/xformers/components/attention/base.py
ADDED
@@ -0,0 +1,95 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from abc import ABCMeta, abstractmethod
from dataclasses import asdict, dataclass
from typing import Optional, Type, TypeVar

import torch
import torch.nn as nn

from xformers._deprecation_warning import deprecated_function
from xformers.components.attention import AttentionMask


@dataclass
class AttentionConfig:
    """Parameters required for all Attentions.
    Can accept and store extra parameters.
    """

    name: str  # the registered name for this attention mechanism
    dropout: float  # dropout probability


Self = TypeVar("Self", bound="Attention")


# Define the common interface, every attention block needs to derive from it
class Attention(nn.Module, metaclass=ABCMeta):
    r"""The base Attention mechanism, which is typically a sub-part of the multi-head attention"""

    _causal_mask: Optional[AttentionMask] = None

    @abstractmethod
    def __init__(self, dropout: Optional[float] = None, *args, **kwargs):
        super().__init__()
        deprecated_function(self)

        # Requires the inputs to be projected
        self.requires_input_projection = True

        # Whether the head dimension needs to be present (if not it can be folded into the batch dimension)
        self.requires_head_dimension = False

        # key padding mask and attention mask must be passed in as separate arguments instead of a merged attention mask
        self.requires_separate_masks = False

        # Requires that K and Q have the same sequence length
        self.requires_same_k_q_dimensions = False

        # Whether the attention owns the single head/multihead mechanism
        # so that the MHA wrapper should skip it
        self.requires_skip_multi_head = False

        # This attention requires a context length which is squared, often due to 2D pooling
        self.requires_squared_context = False

        # Whether this attention mechanism supports attention masks
        self.supports_attention_mask = True
        self.supports_key_padding_mask = False

    @classmethod
    def from_config(cls: Type[Self], config: AttentionConfig) -> Self:
        # Generate the class inputs from the config
        fields = asdict(config)

        # Skip all Nones so that default values are used
        fields = {k: v for k, v in fields.items() if v is not None}

        return cls(**fields)

    @abstractmethod
    def forward(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, *args, **kwargs
    ) -> torch.Tensor:
        raise NotImplementedError

    @staticmethod
    def _maybe_pad_sequence(x: torch.Tensor, mask: torch.Tensor):
        """
        If the sequence is shorter than the mask, return a padded view
        """
        if x.shape[-2] != mask.shape[-1]:
            assert x.shape[-2] < mask.shape[-1], (
                "Sequence is bigger than the provided mask, cannot infer what to do with it."
                " Please update your attention mask"
            )

            pad_size = (0, 0, 0, mask.shape[-1] - x.shape[-2], 0, 0)
            return torch.nn.functional.pad(x, pad_size, mode="constant", value=0.0)

        return x
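To tie this base class to the registry defined in __init__.py above, here is a hedged sketch of a custom subclass, mirroring the register_attention docstring (all names below are illustrative, not part of the library):

from dataclasses import dataclass

import torch
from xformers.components.attention import Attention, AttentionConfig, register_attention


@dataclass
class MyAttentionConfig(AttentionConfig):
    pass  # only the inherited `name` and `dropout` fields


@register_attention("my_attention", MyAttentionConfig)
class MyAttention(Attention):
    def __init__(self, dropout: float = 0.0, *args, **kwargs):
        super().__init__()
        self.drop = torch.nn.Dropout(dropout)

    def forward(self, q, k, v, *args, **kwargs):
        # Plain scaled dot-product, just to make the subclass concrete
        att = torch.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1)
        return self.drop(att) @ v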
.venv/lib/python3.11/site-packages/xformers/components/attention/compositional.py
ADDED
@@ -0,0 +1,341 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


# Credits: this is heavily inspired by the official implementation, present in
# https://github.com/sarthmit/Compositional-Attention
# Original author: Sarthak Mittal

# This is a simplified version, for the sake of clarity, and because some features could be exposed later
# via the library directly.
# In particular, code paths for TPUs, quantization and gumbel softmax have been removed
# We're also following the same dimension ordering as in the rest of the xformers library
# which is to say [Batch, Sequence, Embedding] wherever possible

import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from xformers.components.attention import (
    Attention,
    AttentionConfig,
    AttentionMask,
    register_attention,
)
from xformers.components.attention.core import _softmax
from xformers.components.input_projection import InputProjection, InputProjectionConfig


def _either_or(a: Optional[int], b: int) -> int:
    return a if a is not None else b


@dataclass
class CompositionalAttentionConfig(AttentionConfig):
    dim_model: int
    num_heads: int
    dim_attn: Optional[int] = None
    num_rules: Optional[int] = None
    dim_key: Optional[int] = None
    dim_value: Optional[int] = None
    dim_selection: Optional[int] = None
    dropout: float
    qk_rule: bool = False
    nonlinear: bool = False
    q_compose: bool = False
    bias: bool = True
    causal: Optional[bool] = False
    in_proj_container: Optional[InputProjection] = None
    use_separate_proj_weight: Optional[bool] = False


@register_attention("compositional", CompositionalAttentionConfig)
class CompositionalAttention(Attention):
    """Compositional Attention, as proposed in
    "Compositional Attention: Disentangling search and retrieval"_, S. Mittal et al.

    A key insight from this proposal is that the attention mechanism can be conceived as two steps:
    a search and a retrieval operation. When queried, the model can search for the most relevant information
    (Softmax(QKt)), then retrieve information given the Value.

    Contrary to the original attention proposal, which does not consider interactions in between heads,
    the compositional attention will consider all possible interactions and softmax over that dimension,
    so that the information retrieved covers the most relevant dimensions. The number of heads and rules to
    use is thus typically smaller than for a comparable traditional Transformer, and asking for the same number of heads
    may not fit in memory.

    Args:
        dim_model: dimension of the incoming latent space
        num_heads: number of heads *for the search operation*
        dim_attn: dimension (embedding) of the attention
        num_rules: number of rules to consider *for the retrieval operation*
        dim_selection: dimension of the scoring/selection space for the retrievals
        dim_key, dim_value: dimensions of K and V, if different from Q
        dropout: attention dropout probability
        qk_rule: QK product will drive the retrieval process
        nonlinear: use a non linear method to score the retrievals
        bias: use bias in the initial projection step
        causal: causal computations (attend to the past only)

    _"Compositional Attention: Disentangling search and retrieval": https://arxiv.org/pdf/2110.09419v1.pdf
    """

    def __init__(
        self,
        dim_model: int,
        num_heads: int,
        dim_attn: Optional[int] = None,
        num_rules: Optional[int] = None,
        dim_selection: Optional[int] = None,
        dim_key: Optional[int] = None,
        dim_value: Optional[int] = None,
        dropout=0.0,
        qk_rule=False,
        nonlinear=False,
        q_compose=False,
        in_proj_container: Optional[InputProjection] = None,
        use_separate_proj_weight: Optional[bool] = False,
        bias=True,
        causal=False,
        *_,
        **__,
    ):
        super().__init__()

        # Define the inherited flags
        self.requires_skip_multi_head = (
            True  # This attention owns the multi-head mechanism
        )

        # Handle defaults / undefined values
        self.dim_model = dim_model
        num_rules = _either_or(num_rules, num_heads)
        dim_selection = _either_or(dim_selection, dim_model // num_heads)

        # All the initial definition plumbing
        dim_attn = _either_or(dim_attn, dim_model)
        dim_key = _either_or(dim_key, dim_model)
        dim_value = _either_or(dim_value, dim_model)
|
| 126 |
+
self.in_proj_container = (
|
| 127 |
+
in_proj_container
|
| 128 |
+
if in_proj_container is not None
|
| 129 |
+
else InputProjection(
|
| 130 |
+
query_proj_params=InputProjectionConfig(dim_model, dim_key, bias=bias),
|
| 131 |
+
key_proj_params=InputProjectionConfig(dim_model, dim_key, bias=bias)
|
| 132 |
+
if use_separate_proj_weight
|
| 133 |
+
else None,
|
| 134 |
+
value_proj_params=InputProjectionConfig(dim_model, dim_value, bias=bias)
|
| 135 |
+
if use_separate_proj_weight
|
| 136 |
+
else None,
|
| 137 |
+
)
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
self.num_heads = num_heads
|
| 141 |
+
self.num_rules = num_rules
|
| 142 |
+
self.qk_rule = qk_rule
|
| 143 |
+
self.dim_selection = dim_selection
|
| 144 |
+
self.nonlinear = nonlinear
|
| 145 |
+
self.q_compose = q_compose
|
| 146 |
+
|
| 147 |
+
self.dropout_module = nn.Dropout(dropout)
|
| 148 |
+
self.dim_head = dim_model // num_heads
|
| 149 |
+
self.value_dim = dim_attn // num_rules
|
| 150 |
+
|
| 151 |
+
assert (
|
| 152 |
+
self.value_dim * num_rules == dim_attn
|
| 153 |
+
), "value_dim must be divisible by num_rules"
|
| 154 |
+
|
| 155 |
+
self.scaling = self.dim_head**-0.5
|
| 156 |
+
self.scaling_values = self.dim_selection**-0.5
|
| 157 |
+
|
| 158 |
+
self.out_proj = nn.Linear(self.num_heads * self.value_dim, dim_model, bias=bias)
|
| 159 |
+
|
| 160 |
+
if self.qk_rule:
|
| 161 |
+
self.value_k = nn.Linear(self.value_dim, self.dim_selection, bias=bias)
|
| 162 |
+
if self.q_compose:
|
| 163 |
+
self.value_q = nn.Linear(self.dim_head, self.dim_selection, bias=bias)
|
| 164 |
+
else:
|
| 165 |
+
self.value_q = nn.Linear(
|
| 166 |
+
dim_model, self.dim_selection * self.num_heads, bias=bias
|
| 167 |
+
)
|
| 168 |
+
else:
|
| 169 |
+
if self.q_compose:
|
| 170 |
+
self.value_q = nn.Linear(self.dim_head, self.dim_selection, bias=bias)
|
| 171 |
+
else:
|
| 172 |
+
self.value_q = nn.Linear(
|
| 173 |
+
dim_model, self.dim_selection * self.num_heads, bias=bias
|
| 174 |
+
)
|
| 175 |
+
if self.nonlinear:
|
| 176 |
+
self.score_network: nn.Module = nn.Sequential(
|
| 177 |
+
nn.Linear(
|
| 178 |
+
self.dim_selection + self.value_dim,
|
| 179 |
+
self.dim_selection,
|
| 180 |
+
bias=bias,
|
| 181 |
+
),
|
| 182 |
+
nn.ReLU(),
|
| 183 |
+
nn.Linear(self.dim_selection, 1, bias=bias),
|
| 184 |
+
)
|
| 185 |
+
else:
|
| 186 |
+
self.score_network = nn.Linear(
|
| 187 |
+
self.dim_selection + self.value_dim, 1, bias=bias
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
self.causal = causal
|
| 191 |
+
|
| 192 |
+
# Properties specific to this attention mechanism
|
| 193 |
+
self.supports_attention_mask = True
|
| 194 |
+
self.supports_key_padding_mask = False
|
| 195 |
+
|
| 196 |
+
self._reset_parameters()
|
| 197 |
+
|
| 198 |
+
def _reset_parameters(self):
|
| 199 |
+
# NOTE: in_proj_container is already initialized
|
| 200 |
+
|
| 201 |
+
if self.qk_rule:
|
| 202 |
+
nn.init.xavier_uniform_(self.value_k.weight, gain=1 / math.sqrt(2))
|
| 203 |
+
nn.init.xavier_uniform_(self.value_q.weight, gain=1 / math.sqrt(2))
|
| 204 |
+
else:
|
| 205 |
+
nn.init.xavier_uniform_(self.value_q.weight)
|
| 206 |
+
if self.nonlinear:
|
| 207 |
+
nn.init.xavier_uniform_(self.score_network[0].weight)
|
| 208 |
+
nn.init.xavier_uniform_(self.score_network[2].weight)
|
| 209 |
+
else:
|
| 210 |
+
nn.init.xavier_uniform_(self.score_network.weight)
|
| 211 |
+
|
| 212 |
+
nn.init.xavier_uniform_(self.out_proj.weight)
|
| 213 |
+
if self.out_proj.bias is not None:
|
| 214 |
+
nn.init.constant_(self.out_proj.bias, 0.0)
|
| 215 |
+
|
| 216 |
+
def forward(
|
| 217 |
+
self,
|
| 218 |
+
q: Tensor,
|
| 219 |
+
k: Tensor,
|
| 220 |
+
v: Tensor,
|
| 221 |
+
att_mask: Optional[Tensor] = None,
|
| 222 |
+
*args,
|
| 223 |
+
**kwargs,
|
| 224 |
+
) -> Tensor:
|
| 225 |
+
"""
|
| 226 |
+
Input shape: Time x Batch x Channel
|
| 227 |
+
|
| 228 |
+
Args:
|
| 229 |
+
att_mask (ByteTensor, optional): typically used to
|
| 230 |
+
implement causal attention, where the mask prevents the
|
| 231 |
+
attention from looking forward in time (default: None).
|
| 232 |
+
"""
|
| 233 |
+
|
| 234 |
+
B, Sq, E = q.shape
|
| 235 |
+
_, Sk, _ = k.shape
|
| 236 |
+
|
| 237 |
+
assert E == self.dim_model
|
| 238 |
+
|
| 239 |
+
# First define projected query/key/values
|
| 240 |
+
# We keep the projected and original tensors in flight,
|
| 241 |
+
# depending on the options the original values could be reused
|
| 242 |
+
q_unprojected = q
|
| 243 |
+
q, k, v = self.in_proj_container(query=q, key=k, value=v)
|
| 244 |
+
q *= self.scaling
|
| 245 |
+
|
| 246 |
+
# Init causal mask if needed, now that we know the context length
|
| 247 |
+
if self.causal and (
|
| 248 |
+
self._causal_mask is None or self._causal_mask.shape[0] != Sk
|
| 249 |
+
):
|
| 250 |
+
self._causal_mask = AttentionMask.make_causal(Sq, Sq, device=q.device)
|
| 251 |
+
|
| 252 |
+
# Convenience, create an attention mask if a tensor was passed
|
| 253 |
+
# This sanitizes different mask types being passed, from now on it's additive
|
| 254 |
+
if isinstance(att_mask, torch.Tensor):
|
| 255 |
+
# By default we don't know of the causality, and a check would be expensive
|
| 256 |
+
att_mask_additive: Optional[AttentionMask] = (
|
| 257 |
+
AttentionMask.from_bool(att_mask)
|
| 258 |
+
if att_mask.dtype == torch.bool
|
| 259 |
+
else AttentionMask(att_mask, is_causal=False)
|
| 260 |
+
)
|
| 261 |
+
else:
|
| 262 |
+
att_mask_additive = None
|
| 263 |
+
|
| 264 |
+
# Handle the attention and key padding masks
|
| 265 |
+
if self._causal_mask is not None:
|
| 266 |
+
# Optionally add the causal mask
|
| 267 |
+
if att_mask_additive is not None:
|
| 268 |
+
att_mask_additive += self._causal_mask
|
| 269 |
+
else:
|
| 270 |
+
att_mask_additive = self._causal_mask
|
| 271 |
+
|
| 272 |
+
# Flatten the heads or the rules
|
| 273 |
+
q = (
|
| 274 |
+
q.view(B, Sq, self.num_heads, self.dim_head)
|
| 275 |
+
.movedim(2, 1)
|
| 276 |
+
.flatten(0, 1) # [B * num_heads, Sq, dim_head]
|
| 277 |
+
)
|
| 278 |
+
k = (
|
| 279 |
+
k.view(B, Sk, self.num_heads, self.dim_head).movedim(2, 1).flatten(0, 1)
|
| 280 |
+
) # [B * num_heads, Sk, dim_head]
|
| 281 |
+
v = v.view(B, -1, self.num_rules, self.value_dim).movedim(2, 1).flatten(0, 1)
|
| 282 |
+
|
| 283 |
+
# Compute the search: Softmax(QKt)
|
| 284 |
+
attn_weights = torch.bmm(q, k.transpose(1, 2)) # [B * self.num_heads, Sq, Sk]
|
| 285 |
+
|
| 286 |
+
if att_mask_additive is not None:
|
| 287 |
+
attn_weights += att_mask_additive.values
|
| 288 |
+
|
| 289 |
+
attn_weights = _softmax(attn_weights, causal=self.causal)
|
| 290 |
+
|
| 291 |
+
attn_weights = attn_weights.view(B, self.num_heads, Sq, Sk)
|
| 292 |
+
attn_probs = self.dropout_module(attn_weights)
|
| 293 |
+
|
| 294 |
+
# Now compute the information retrieval
|
| 295 |
+
# keep all the heads in flight, we'll score the different possibilities
|
| 296 |
+
# - compute all the possible retrievals
|
| 297 |
+
v = v.view(B, 1, self.num_rules, Sk, self.value_dim)
|
| 298 |
+
attn_probs = attn_probs.unsqueeze(2)
|
| 299 |
+
attn = torch.matmul(attn_probs, v).view(
|
| 300 |
+
B, self.num_heads, self.num_rules, Sq, self.value_dim
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
attn = attn.movedim(3, 1) # [B, Sq, H, Rules, Values]
|
| 304 |
+
|
| 305 |
+
# - search the most appropriate retrieval among all the values
|
| 306 |
+
if self.q_compose:
|
| 307 |
+
v_q = self.value_q(q.transpose(0, 1)).view(
|
| 308 |
+
B, Sq, self.num_heads, 1, self.dim_selection
|
| 309 |
+
)
|
| 310 |
+
else:
|
| 311 |
+
v_q = self.value_q(q_unprojected).view(
|
| 312 |
+
B, Sq, self.num_heads, 1, self.dim_selection
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
if self.qk_rule:
|
| 316 |
+
v_q *= self.scaling_values
|
| 317 |
+
v_k = (
|
| 318 |
+
self.value_k(attn)
|
| 319 |
+
.view(B, Sq, self.num_heads, self.num_rules, self.dim_selection)
|
| 320 |
+
.transpose(4, 3)
|
| 321 |
+
.contiguous()
|
| 322 |
+
)
|
| 323 |
+
v_score = torch.matmul(v_q, v_k).view(
|
| 324 |
+
B, Sq, self.num_heads, self.num_rules, 1
|
| 325 |
+
)
|
| 326 |
+
else:
|
| 327 |
+
v_q = v_q.expand(-1, -1, -1, self.num_rules, -1)
|
| 328 |
+
v_in = torch.cat([attn, v_q], dim=-1)
|
| 329 |
+
v_score = self.score_network(v_in).view(
|
| 330 |
+
B, Sq, self.num_heads, self.num_rules, 1
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
v_score = F.softmax(v_score, dim=3)
|
| 334 |
+
|
| 335 |
+
# - extracted values are the original attention (inc. all the values) weighted by value score
|
| 336 |
+
attn = (attn * v_score).sum(dim=3).view(B, Sq, self.num_heads * self.value_dim)
|
| 337 |
+
|
| 338 |
+
# Final attention projection, same as other mechanisms
|
| 339 |
+
attn = self.out_proj(attn)
|
| 340 |
+
|
| 341 |
+
return attn
|
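A minimal usage sketch of the class added above, assuming this vendored xformers build and PyTorch import cleanly in the environment; the batch, sequence and width sizes below are illustrative assumptions, not values taken from the diff:

import torch
from xformers.components.attention.compositional import CompositionalAttention

# Illustrative sizes (assumption): batch 2, sequence 16, dim_model 64, 4 search heads and 4 retrieval rules
attn = CompositionalAttention(dim_model=64, num_heads=4, num_rules=4, dropout=0.0)
x = torch.randn(2, 16, 64)  # [Batch, Sequence, Embedding] ordering, as stated in the file header
out = attn(q=x, k=x, v=x)   # search (softmax over QK^t), then retrieval scored across the rules
print(out.shape)            # expected: torch.Size([2, 16, 64])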
.venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__init__.py
ADDED
@@ -0,0 +1,26 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from enum import Enum

from .base import FeatureMap, FeatureMapConfig
from .softmax import NormDistribution, SMHyperbolic, SMOrf, SMReg


class FeatureMapType(str, Enum):
    SMOrf = "sm_orf"
    SMHyp = "sm_hyp"
    SMReg = "sm_reg"  # regularized softmax kernel


__all__ = [
    "SMOrf",
    "SMReg",
    "SMHyperbolic",
    "NormDistribution",
    "FeatureMapConfig",
    "FeatureMap",
]
.venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (865 Bytes).
.venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__pycache__/base.cpython-311.pyc
ADDED
Binary file (3.02 kB).
.venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/__pycache__/softmax.cpython-311.pyc
ADDED
Binary file (11.7 kB).
.venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/base.py
ADDED
@@ -0,0 +1,61 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from abc import abstractmethod
from dataclasses import asdict, dataclass
from typing import Optional, Type, TypeVar

import torch

"""
Feature maps allow for a given query or key to be encoded in a different space.
"""

Self = TypeVar("Self", bound="FeatureMap")


@dataclass
class FeatureMapConfig:
    name: str
    dim_features: int
    iter_before_redraw: Optional[int]
    normalize_inputs: Optional[bool]
    epsilon: Optional[float]


class FeatureMap(torch.nn.Module):
    def __init__(
        self,
        dim_features: int,
        iter_before_redraw: Optional[int] = None,
        normalize_inputs: bool = False,
        epsilon: float = 1e-6,
    ):
        super().__init__()

        self.dim_features = dim_features
        self.dim_feature_map = dim_features

        self.iter_before_redraw = iter_before_redraw
        self.features: Optional[torch.Tensor] = None
        self.epsilon = epsilon
        self.normalize_inputs = normalize_inputs

        self._iter_counter = 0

    @abstractmethod
    def _get_feature_map(self, dim_input: int, dim_features: int, device: torch.device):
        raise NotImplementedError()

    @classmethod
    def from_config(cls: Type[Self], config: FeatureMapConfig) -> Self:
        # Generate the class inputs from the config
        fields = asdict(config)

        # Skip all Nones so that default values are used
        fields = {k: v for k, v in fields.items() if v is not None}

        return cls(**fields)
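To illustrate the contract that `FeatureMap` subclasses fulfil (implement `_get_feature_map`, cache the drawn projection in `self.features`), here is a toy subclass; it is an illustration only, not part of the library, and the sizes are assumptions:

import torch
from xformers.components.attention.feature_maps import FeatureMap

class RandomProjectionMap(FeatureMap):
    # Toy example: encode queries/keys through a fixed random Gaussian projection
    @torch.no_grad()
    def _get_feature_map(self, dim_input: int, dim_features: int, device: torch.device):
        return torch.randn(dim_input, dim_features, device=device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.features is None:  # draw the projection lazily, on the right device
            self.features = self._get_feature_map(x.shape[-1], self.dim_feature_map, x.device)
        return x @ self.features

print(RandomProjectionMap(dim_features=16)(torch.randn(2, 8, 32)).shape)  # torch.Size([2, 8, 16])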
.venv/lib/python3.11/site-packages/xformers/components/attention/feature_maps/softmax.py
ADDED
@@ -0,0 +1,288 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import math
from enum import Enum, auto
from typing import Optional

import torch
from torch.autograd.profiler import record_function

from .base import FeatureMap

"""
A set of feature maps which approximate the softmax kernel, as per the Performers_ paper.

_Performers: "Rethinking attention with performers." K. Choromanski et al. (2020).
https://arxiv.org/pdf/2009.14794v1.pdf
"""


class NormDistribution(Enum):
    Xi = auto()
    Uniform = auto()


class SoftMaxPositiveEstimators(FeatureMap):
    def __init__(
        self,
        dim_features: int,
        iter_before_redraw: Optional[int],
        normalize_inputs: bool = False,
        epsilon: float = 1e-6,
        softmax_temp: float = -1,
    ):
        super().__init__(dim_features, iter_before_redraw, normalize_inputs, epsilon)
        self.softmax_temp = softmax_temp

        # Handle the scaling from all kernels by √m.
        # This normalizes for all the feature maps involved
        self.h_scale = math.log(math.sqrt(self.dim_features))

    def pre_scale(self, x: torch.Tensor) -> torch.Tensor:
        with record_function("feature_map::pre_scale"):
            # Re-draw counting logic
            if (
                (
                    self.iter_before_redraw is not None
                    and self._iter_counter > self.iter_before_redraw
                )
                or self.features is None
                or self.features.device != x.device
            ):
                # The feature map is actually using half the dimension, we'll concatenate + and - features
                self._iter_counter = 1
                self.features = self._get_feature_map(
                    x.shape[-1], self.dim_feature_map, x.device
                )

            features = self.features
            assert features is not None

            if features.dtype != x.dtype:
                self.features = features.to(x.dtype)

            self._iter_counter += 1

            # Normalization / softmax
            if self.softmax_temp < 0:
                # A = exp(QK.t/√d), so each input will be scaled by √√d
                self.softmax_temp = x.shape[-1] ** -0.25

            x_scaled = x * self.softmax_temp

            # Compute the scaling factors in logspace, applied from within the exponential
            # - dimnish possible exponential overflow
            # - remove a multiply across the batch, replace by an addition
            norm_x_2 = torch.einsum("...d,...d->...", x_scaled, x_scaled).unsqueeze(-1)
            self.offset = -0.5 * norm_x_2 - self.h_scale + self.epsilon

            if self.normalize_inputs:
                # L0 normalize the exponential term, can be useful for numerical stability
                # This ensures that features +- offset is below 1
                self.offset -= norm_x_2.max(1, keepdim=True)[0]

        # Return the scaled inputs, the rest depends on the kernel being used
        return x_scaled

    @staticmethod
    @torch.no_grad()
    def _get_random_ortho_matrix(
        blocks: int,
        dim: int,
        device: torch.device,
        norm_distribution: NormDistribution = NormDistribution.Uniform,
    ) -> torch.Tensor:
        r"""
        Generate a random matrix whose rows are exactly orthonormal

        "How to generate random matrices from the classical compact groups", Mezzadri, 2007
        https://arxiv.org/pdf/math-ph/0609050v2.pdf

        .. note: the typical qr decomposition does not give uniform results, qr decomposition is not
        unique and the qr decomposition routines are biased towards numerical stability. See the above
        paper for more information.

        .. note: this does not follow the original implementation from the Performers authors.
        see docs/assets/kde plots to visualize the impact of using the R signs to correct Q
        """

        H = torch.randn((blocks, dim, dim), device=device, requires_grad=False)

        # Randomly scale the norms of the features, Xi distributed
        if norm_distribution == NormDistribution.Xi:
            # NOTE: This averages to sqrt(d)
            norms = torch.sqrt(torch.einsum("...d,...d->...", H, H))

        Q, R = torch.linalg.qr(H)
        Q = torch.diag_embed(torch.sign(torch.diagonal(R, dim1=1, dim2=2))) @ Q

        # Normalize if need be. Uniform NormDistribution does nothing, Q is already orthonormal
        if norm_distribution == NormDistribution.Xi:
            return torch.diag_embed(norms) @ Q

        return Q


class SMOrf(SoftMaxPositiveEstimators):
    """
    "Positive random orthogonal features" softmax estimator,
    SM_ort^m+, as proposed in the Performers_ paper, Lemma 1.

    _Performers: "Rethinking attention with performers." K. Choromanski et al. (2020).
    https://arxiv.org/pdf/2009.14794v1.pdf
    """

    @torch.no_grad()
    def _get_feature_map(self, dim_input: int, dim_features: int, device: torch.device):
        """
        Generate the projection matrix onto the random features

        .. note: The heads dimension needs to be taken into account, hence the per-block random matrix
        and not uniformally random.
        """

        # Get per block random unitary matrices.
        # We need enough of them to project the whole input dimension, regardless of the
        # requested dimension of the features
        features = self._get_random_ortho_matrix(
            math.ceil(dim_input / dim_features),
            dim_features,
            norm_distribution=NormDistribution.Xi,
            device=device,
        )

        return features.flatten(0, 1)[:dim_input]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Softmax-dimension related scaling, shared for all kernels
        x_scaled = super().pre_scale(x)
        assert self.features is not None

        # Project onto the random feature map.
        x_scaled = x_scaled @ self.features
        return torch.exp(x_scaled + self.offset)


class SMHyperbolic(SoftMaxPositiveEstimators):
    """
    "Positive random features hyperbolic" estimator, SMHyp+,
    as proposed in the Performers_ paper, Lemma 1.

    _Performers: "Rethinking attention with performers." K. Choromanski et al. (2020).
    https://arxiv.org/pdf/2009.14794v1.pdf
    """

    def __init__(
        self,
        dim_features: int,
        iter_before_redraw: Optional[int],
        normalize_inputs: bool = False,
        epsilon: float = 1e-6,
        softmax_temp: float = -1,
    ):
        super().__init__(
            dim_features, iter_before_redraw, normalize_inputs, epsilon, softmax_temp
        )

        assert (
            dim_features % 2 == 0
        ), "The feature dimension needs to be even with this kernel"
        self.dim_feature_map = self.dim_features // 2

    @torch.no_grad()
    def _get_feature_map(self, dim_input: int, dim_features: int, device: torch.device):
        """
        Generate the projection matrix onto the random features

        .. note: The heads dimension needs to be taken into account, hence the per-block random matrix
        and not uniformally random.
        """

        # Get per block random unitary matrices.
        # We need enough of them to project the whole input dimension, regardless of the
        # requested dimension of the features
        features = self._get_random_ortho_matrix(
            math.ceil(dim_input / dim_features),
            dim_features,
            norm_distribution=NormDistribution.Xi,
            device=device,
        )

        return features.flatten(0, 1)[:dim_input]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Softmax-dimension related scaling, shared for all kernels
        x_scaled = super().pre_scale(x)

        # Project onto the random feature map, concatenate both + and - results
        # This follows Lemma 1 in the original Performers Paper to best approximate a
        # softmax kernel (cosh representation)
        x_scaled = x_scaled @ self.features
        return torch.cat(
            [torch.exp(x_scaled + self.offset), torch.exp(-x_scaled + self.offset)],
            dim=-1,
        )


class SMReg(SoftMaxPositiveEstimators):
    """
    "Regularized softmax kernel" estimator, SMREG+, as proposed in the Performers_ paper.

    _Performers: "Rethinking attention with performers." K. Choromanski et al. (2020).
    https://arxiv.org/pdf/2009.14794v1.pdf
    """

    def __init__(
        self,
        dim_features: int,
        iter_before_redraw: Optional[int],
        normalize_inputs: bool = False,
        epsilon: float = 1e-6,
        softmax_temp: float = -1,
    ):
        super().__init__(
            dim_features, iter_before_redraw, normalize_inputs, epsilon, softmax_temp
        )

        assert (
            dim_features % 2 == 0
        ), "The feature dimension needs to be even with this kernel"
        self.dim_feature_map = self.dim_features // 2

    @torch.no_grad()
    def _get_feature_map(self, dim_input: int, dim_features: int, device: torch.device):
        """
        Generate the projection matrix onto the random features

        .. note: The heads dimension needs to be taken into account, hence the per-block random matrix
        and not uniformally random.
        """

        # Get per block random unitary matrices.
        # We need enough of them to project the whole input dimension, regardless of the
        # requested dimension of the features
        features = self._get_random_ortho_matrix(
            math.ceil(dim_input / dim_features),
            dim_features,
            norm_distribution=NormDistribution.Uniform,
            device=device,
        ).flatten(0, 1)
        norms = math.sqrt(dim_input) * torch.ones(features.shape[0], device=device)
        return (torch.diag(norms) @ features)[:dim_input]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Softmax-dimension related scaling, shared for all kernels
        x_scaled = super().pre_scale(x)

        # Project onto the random feature map, concatenate both + and - results
        # This follows Lemma 1 in the original Performers Paper to best approximate a
        # softmax kernel (cosh representation + sample regularization)
        x_scaled = x_scaled @ self.features
        return torch.cat(
            [torch.exp(x_scaled + self.offset), torch.exp(-x_scaled + self.offset)],
            dim=-1,
        )
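These estimators are typically plugged into Performer-style linear attention: map q and k to positive features, then contract over the sequence axis once instead of forming the full S x S score matrix. A sketch under the assumption that the SMOrf class above is importable from this environment; the tensor shapes are illustrative, and this is not how xformers wires the map internally (its favor attention module does that):

import torch
from xformers.components.attention.feature_maps import SMOrf

feature_map = SMOrf(dim_features=64, iter_before_redraw=None)  # keep one random projection across calls

q, k, v = (torch.randn(2, 128, 32) for _ in range(3))
phi_q, phi_k = feature_map(q), feature_map(k)        # (B, S, 64), strictly positive features

kv = torch.einsum("bsf,bsd->bfd", phi_k, v)          # contract over keys once: O(S), not O(S^2)
normalizer = (phi_q @ phi_k.sum(dim=1).unsqueeze(-1)).clamp_min(1e-6)
approx_attention = (phi_q @ kv) / normalizer         # approximation of the softmax attention output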
.venv/lib/python3.11/site-packages/xformers/components/attention/global_tokens.py
ADDED
@@ -0,0 +1,122 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

from xformers.components.attention import (
    Attention,
    AttentionConfig,
    AttentionMask,
    maybe_sparsify,
    register_attention,
    sparsify,
)
from xformers.components.attention.attention_patterns import (
    causal_1d_pattern,
    global_token_pattern,
)
from xformers.components.attention.core import scaled_dot_product_attention


@dataclass
class GlobalAttentionConfig(AttentionConfig):
    attention_query_mask: torch.Tensor  # Mark the queries which have global attention
    causal: Optional[bool]
    force_sparsity: Optional[bool]


@register_attention("global", GlobalAttentionConfig)
class GlobalAttention(Attention):
    def __init__(
        self,
        dropout: float,
        attention_query_mask: torch.Tensor,
        causal: bool = False,
        force_sparsity: bool = False,
        *_,
        **__,
    ):
        r"""
        Global attention, as proposed for instance in BigBird_ or Longformer_.

        Global means in that case that the queries positively labelled in the ```attention_query_mask``` can attend
        to all the other queries. The queries negatively labelled in the ```attention_query_mask``` cannot attend to
        any other query.

        This implementation is sparse-aware, meaning that the empty attention parts will not be represented in memory.

        Args:
            dropout (float): probability of an element to be zeroed
            attention_query_mask (torch.Tensor): if true, this query can attend to all the others

        """
        super().__init__()

        assert attention_query_mask.dtype == torch.bool, "A boolean mask is expected"
        assert (
            attention_query_mask.shape[1] == 1
            and attention_query_mask.shape[0] > attention_query_mask.shape[1]
        ), "A N x 1 query mask is expected"

        self.attn_drop = nn.Dropout(dropout, inplace=False)
        self.attention_mask = global_token_pattern(attention_query_mask[:, 0])
        self.force_sparsity = force_sparsity

        if causal:
            self.attention_mask &= causal_1d_pattern(attention_query_mask.shape[1])

        self.attention_mask = (
            sparsify(self.attention_mask)
            if self.force_sparsity
            else maybe_sparsify(self.attention_mask)
        )

        # Properties specific to this attention mechanism
        self.requires_same_k_q_dimensions = True
        self.supports_attention_mask = False
        self.supports_key_padding_mask = False

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        att_mask: Optional[Union[torch.Tensor, AttentionMask]] = None,
        *_,
        **__,
    ):
        # Make sure that the mask is on the right device
        if self.attention_mask.device != q.device:
            self.attention_mask = self.attention_mask.to(q.device)

        # Mask-aware attention
        if att_mask is not None:
            if att_mask.dtype == torch.bool and isinstance(
                self.attention_mask, AttentionMask
            ):
                if not isinstance(att_mask, AttentionMask):
                    att_mask = AttentionMask.from_bool(att_mask)
                mask = self.attention_mask + att_mask
            else:
                mask = self.attention_mask & att_mask
        else:
            mask = self.attention_mask

        # Handle q/k/v which would not fit the mask
        seq_len = q.shape[-2]
        q_, k_, v_ = map(lambda x: self._maybe_pad_sequence(x, mask), (q, k, v))

        # Normal attention with the global tokens mask
        att = scaled_dot_product_attention(
            q=q_, k=k_, v=v_, att_mask=mask, dropout=self.attn_drop
        )

        # Take into account an hypothetical padding
        return att[:, :seq_len, :]
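A rough usage sketch for the class above. The sequence length, the number of global positions, and the feature width are all assumptions for illustration; note that very sparse patterns may be converted to a sparse representation by maybe_sparsify, so the exact execution path can differ from this CPU-oriented sketch:

import torch
from xformers.components.attention.global_tokens import GlobalAttention

SEQ = 16
query_mask = torch.zeros(SEQ, 1, dtype=torch.bool)  # N x 1 boolean mask, as the asserts above require
query_mask[:6] = True                               # assumption: the first 6 positions act as global tokens

attn = GlobalAttention(dropout=0.0, attention_query_mask=query_mask)
x = torch.randn(2, SEQ, 32)
out = attn(q=x, k=x, v=x)  # flagged queries attend to every position; the others only to the flagged ones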
.venv/lib/python3.11/site-packages/xformers/components/attention/linformer.py
ADDED
@@ -0,0 +1,74 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn

from xformers.components.attention import Attention, AttentionConfig, register_attention
from xformers.components.attention.core import scaled_dot_product_attention


@dataclass
class LinformerSelfAttentionConfig(AttentionConfig):
    seq_len: int  # dimension of the input sequence
    k: Optional[int]  # dimension of the internal space


@register_attention("linformer", LinformerSelfAttentionConfig)
class LinformerAttention(Attention):
    def __init__(
        self, dropout: float, seq_len: int, k: Optional[int] = None, *args, **kwargs
    ):
        """
        Linformer attention mechanism,
        from `Linformer: Self-Attention with Linear Complexity`_, Wang et al (2020).
        The original notation is kept as is.

        .. _`Linformer: Self-Attention with Linear Complexity` : https://arxiv.org/abs/2006.04768v2
        """
        super().__init__()

        if k is None:
            k = seq_len // 4

        self.k = k
        self.E = nn.Linear(seq_len, k, bias=False)
        self.F = nn.Linear(seq_len, k, bias=False)
        self.attn_drop = nn.Dropout(dropout, inplace=False)
        self.seq_len = seq_len

        # MHA related flags:
        # kq need to have the same dimension
        self.requires_same_k_q_dimensions = True

        # This attention does not support attention masks
        self.supports_attention_mask = False

    def forward(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, *args, **kwargs
    ):
        # Handle a smaller dimension than expected
        padding = 0
        if q.shape[1] < self.seq_len:
            padding = self.seq_len - q.shape[1]
            pad_dims = (0, 0, 0, padding)
            q = torch.nn.functional.pad(q, pad_dims)
            k = torch.nn.functional.pad(k, pad_dims)
            v = torch.nn.functional.pad(v, pad_dims)

        k_projected = self.E(k.transpose(-2, -1)).transpose(-2, -1)
        v_projected = self.F(v.transpose(-2, -1)).transpose(-2, -1)

        y = scaled_dot_product_attention(
            q=q, k=k_projected, v=v_projected, att_mask=None, dropout=self.attn_drop
        )

        y = self.attn_drop(y)

        return y[:, :-padding, :] if padding > 0 else y
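The two `nn.Linear(seq_len, k)` projections E and F act on the sequence axis, so keys and values are compressed from `seq_len` positions down to `k` before the usual scaled dot-product attention. A usage sketch with illustrative sizes (the specific numbers are assumptions):

import torch
from xformers.components.attention.linformer import LinformerAttention

attn = LinformerAttention(dropout=0.0, seq_len=256, k=64)  # k would default to seq_len // 4 anyway
x = torch.randn(2, 256, 32)
out = attn(q=x, k=x, v=x)
print(out.shape)  # torch.Size([2, 256, 32]); attention was computed against only 64 projected positions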
.venv/lib/python3.11/site-packages/xformers/components/attention/ortho.py
ADDED
@@ -0,0 +1,324 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import logging
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union

import torch
import torch.autograd.profiler as profiler
import torch.nn as nn
import torch.nn.functional as Fn

from xformers.components.attention import (
    Attention,
    AttentionConfig,
    AttentionMask,
    register_attention,
)
from xformers.components.attention.core import (
    scaled_dot_product_attention,
    scaled_query_key_softmax,
)

logger = logging.getLogger("xformers")


class LandmarkSelection(str, Enum):
    Orthogonal = "orthogonal"
    KMeans = "kmeans"
    KMeans_Spherical = "kmeans_spherical"
    Random = "random"


@dataclass
class OrthoformerAttentionConfig(AttentionConfig):
    """
    num_landmarks       Number of landmarks to use for softmax approximation.
    subsample_fraction  Percentage of q_samples matrix to sample per iteration
    landmark_selection  Landmark selection strategy
    """

    num_landmarks: Optional[int]
    subsample_fraction: Optional[float]
    landmark_selection: Optional[LandmarkSelection]


@register_attention("orthoformer", OrthoformerAttentionConfig)
class OrthoFormerAttention(Attention):
    def __init__(
        self,
        dropout: float,
        num_landmarks: int = 32,
        subsample_fraction: float = 1.0,
        landmark_selection: LandmarkSelection = LandmarkSelection.Orthogonal,
        *args,
        **kwargs,
    ):
        """
        Orthoformer_ attention mechanism.
        ::

            "Keeping Your Eye on the Ball: Trajectory Attention in Video Transformers"
            Patrick, M., Campbell, D., Asano, Y., Misra, I., Metze, F., Feichtenhofer,
            C., Vedaldi, A., Henriques, J. (2021)

            Reference codebase: https://github.com/facebookresearch/Motionformer

        .. _Orthoformer: https://arxiv.org/abs/2106.05392

        """
        super().__init__()

        self.num_landmarks = num_landmarks
        self.attn_drop = nn.Dropout(dropout)
        self.subsample_fraction = subsample_fraction
        self.landmark_selection = landmark_selection

        # Properties specific to this attention mechanism
        self.supports_attention_mask = True
        self.supports_key_padding_mask = False

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        att_mask: Optional[Union[AttentionMask, torch.Tensor]] = None,
        *args,
        **kwargs,
    ):
        N = k.shape[1]

        if self.num_landmarks == N:
            # Default attention
            x = scaled_dot_product_attention(q, k, v, att_mask)
        else:
            with torch.no_grad(), profiler.record_function("select landmarks"):
                if self.landmark_selection == LandmarkSelection.Orthogonal:
                    landmarks = self._compute_orthogonal_landmarks(q)
                elif self.landmark_selection == LandmarkSelection.Random:
                    half_L = self.num_landmarks // 2
                    landmarks_q = q[:, torch.randint(q.size(1), (half_L,)), :]
                    landmarks_k = k[:, torch.randint(k.size(1), (half_L,)), :]
                    landmarks = torch.cat((landmarks_q, landmarks_k), dim=-2)
                elif self.landmark_selection == LandmarkSelection.KMeans:
                    landmarks = self._cluster_landmarks(q)
                elif self.landmark_selection == LandmarkSelection.KMeans_Spherical:
                    landmarks = self._cluster_landmarks(q, spherical=True)

            if att_mask is not None:
                logger.warning(
                    "Orthoformer: attention mask passed alongside with using landmarks to reduce dimensions. \
                    The two are typically not compatible"
                )
                # FIXME: Should we still accept a mask in that case ?
                att_mask = None

            # pyre-ignore[61]: TODO(T103337542): `landmarks` mistakenly seems
            # like it could be uninitialized.
            kernel_1 = scaled_query_key_softmax(q, landmarks, att_mask)
            # pyre-ignore[61]: TODO(T103337542): `landmarks` mistakenly seems
            # like it could be uninitialized.
            kernel_2 = scaled_query_key_softmax(landmarks, k, att_mask)
            x = torch.matmul(kernel_1, torch.matmul(kernel_2, v))
        x = self.attn_drop(x)
        return x

    def _cluster_landmarks(
        self,
        q: torch.Tensor,
        spherical: bool = False,
        num_iters: int = 6,
    ) -> torch.Tensor:
        """
        Construct set of landmarks by recursively selecting new landmarks
        that are maximally orthogonal to the existing set.
        Returns near orthogonal landmarks with shape (B, M, D).
        """

        num_landmarks = min(self.num_landmarks, q.shape[1])

        if self.subsample_fraction < 1.0:
            num_samples = max(
                int(self.subsample_fraction * q.size(-2)), num_landmarks
            )  # Need at least M/2 samples of queries and keys
            q_samples = q[:, torch.randint(q.size(-2), (num_samples,)), :]  # (B, N, D)
        else:
            q_samples = q  # (B, N, D)

        if spherical:
            q_samples_normalized = Fn.normalize(
                q_samples, p=2, dim=-1
            )  # may need to change default eps to eps=1e-8 for mixed precision compatibility
            landmarks = self._kmeans_spherical(
                q_samples_normalized, num_landmarks, num_iters
            )
        else:
            landmarks = self._kmeans(q_samples, num_landmarks, num_iters)
        return landmarks  # (B, M, D)

    def _kmeans(self, x: torch.Tensor, K: int, num_iters: int = 10):
        """
        Arguments:
            x: (B, N, D)
            K: number of clusters
            num_iters: the number of kmeans updates
        """

        B, N, D = x.size()
        assert K <= N, f"{K} > {N}"

        c = x[
            :, torch.randperm(N, device=x.device)[:K], :
        ].clone()  # initialisation for the centroids

        with profiler.record_function("kmeans"):
            x_i = x.view(B, N, 1, D)
            c_j = c.view(B, 1, K, D)
            counts = c.new_zeros(B, K)
            ones = x.new_ones((B, N))

            for _ in range(num_iters):
                # E step: assign points to the nearest cluster
                D_ij = ((x_i - c_j) ** 2).sum(-1)  # (B, N, K) squared distances
                cl = D_ij.argmin(
                    dim=-1, keepdim=True
                ).long()  # (B, N, 1) index of point to nearest cluster

                # M step: update the centroids
                c.zero_()
                c.scatter_add_(-2, cl.repeat(1, 1, D), x)  # sum of points per cluster
                counts.fill_(1e-6)  # avoid div0
                counts.scatter_add_(
                    -1, cl.squeeze(-1), ones
                )  # number of points per cluster
                c.divide_(counts.unsqueeze(-1))  # compute the average

        return c

    def _kmeans_spherical(self, x: torch.Tensor, K: int, num_iters=10):
        """
        Arguments:
            x: (B, N, D)
        """
        B, N, D = x.size()
        assert K <= N, f"{K} > {N}"

        # initialisation for the centroids
        c = x[:, torch.randperm(N, device=x.device)[:K], :].clone()

        with profiler.record_function("kmeans_spherical"):
            counts = c.new_zeros(B, K)
            ones = x.new_ones((B, N))

            for _ in range(num_iters):
                # E step: assign points to the nearest cluster
                D_ij = torch.matmul(
                    x, c.transpose(-2, -1)
                )  # (B, N, K) cosine similarity
                cl = D_ij.argmax(
                    dim=-1, keepdim=True
                ).long()  # (B, N, 1) index of point to nearest cluster

                # M step: update the centroids
                c.zero_()
                c.scatter_add_(-2, cl.repeat(1, 1, D), x)  # sum of points per cluster
                counts.fill_(1e-6)  # avoid div0
                counts.scatter_add_(
                    -1, cl.squeeze(-1), ones
                )  # number of points per cluster
                c.divide_(counts.unsqueeze(-1))  # compute the average
                c = Fn.normalize(c, p=2, dim=-1)  # renormalise
        return c

    def _compute_orthogonal_landmarks(self, q: torch.Tensor) -> torch.Tensor:
        """
        Construct set of landmarks by recursively selecting new landmarks
        that are maximally orthogonal to the existing set.
        Returns near orthogonal landmarks with shape (B, M, D).
        """

        if self.subsample_fraction < 1.0:
            # Need at least M samples of queries
            num_samples = max(
                int(self.subsample_fraction * q.size(-2)), self.num_landmarks
            )
            q_samples = q[
                :, torch.randint(q.size(-2), (num_samples,), device=q.device), :
            ]
        else:
            # (B, N, D)
            q_samples = q

        # may need to change default eps to eps=1e-8 for mixed precision compatibility
        q_samples_normalized = Fn.normalize(q_samples, p=2, dim=-1)
        B, N, D = q_samples_normalized.shape

        selected_mask = torch.zeros((B, N, 1), device=q_samples_normalized.device)
        landmark_mask = torch.ones(
            (B, 1, 1), dtype=selected_mask.dtype, device=q_samples_normalized.device
        )

        # Get initial random landmark
        random_idx = torch.randint(
            q_samples_normalized.size(-2), (B, 1, 1), device=q_samples_normalized.device
        )
        selected_mask.scatter_(-2, random_idx, landmark_mask)

        # Selected landmarks
        selected_landmarks = torch.empty(
            (B, self.num_landmarks, D),
            device=q_samples_normalized.device,
            dtype=q_samples_normalized.dtype,
        )
        selected_landmarks[:, 0, :] = q_samples_normalized[
            torch.arange(q_samples_normalized.size(0)), random_idx.view(-1), :
        ].view(B, D)

        # Store computed cosine similarities
        cos_sims = torch.empty(
            (B, N, self.num_landmarks),
            device=q_samples_normalized.device,
            dtype=q_samples_normalized.dtype,
        )

        for M in range(1, self.num_landmarks):
            with profiler.record_function("find new landmark"):
                # Calculate absolute cosine similarity between selected and unselected landmarks
                # (B, N, D) * (B, D) -> (B, N)
                cos_sims[:, :, M - 1] = torch.einsum(
                    "b n d, b d -> b n",
                    q_samples_normalized,
                    selected_landmarks[:, M - 1, :],
                ).abs()

                # (B, N, M) cosine similarities of current set of landmarks wrt all queries and keys
                cos_sim_set = cos_sims[:, :, :M]

                # Get orthogonal landmark: landmark with smallest absolute cosine similarity:
                # set cosine similarity for already selected landmarks to > 1
                cos_sim_set.view(-1, M)[selected_mask.flatten().bool(), :] = 10

                # (B,) - want max for non
                selected_landmark_idx = cos_sim_set.amax(-1).argmin(-1)

                # Add most orthogonal landmark to selected landmarks:
                selected_landmarks[:, M, :] = q_samples_normalized[
                    torch.arange(q_samples_normalized.size(0)), selected_landmark_idx, :
                ].view(B, D)

                # Removed selected indices from non-selected mask:
                selected_mask.scatter_(
                    -2, selected_landmark_idx.unsqueeze(-1).unsqueeze(-1), landmark_mask
                )

        # (B, M, D)
        landmarks = torch.masked_select(q_samples, selected_mask.bool()).reshape(
            B, -1, D
        )
        return landmarks  # (B, M, D)
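The landmark trick above replaces the full S x S softmax with two smaller ones (queries vs landmarks, landmarks vs keys). A usage sketch; the landmark count and tensor sizes are arbitrary assumptions, and it presumes this vendored module imports as-is:

import torch
from xformers.components.attention.ortho import LandmarkSelection, OrthoFormerAttention

attn = OrthoFormerAttention(
    dropout=0.0,
    num_landmarks=16,                                  # assumption: 16 landmarks for a length-128 sequence
    landmark_selection=LandmarkSelection.Orthogonal,
)
x = torch.randn(2, 128, 32)
out = attn(q=x, k=x, v=x)  # kernel_1 (128 x 16) and kernel_2 (16 x 128) stand in for the 128 x 128 map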
.venv/lib/python3.11/site-packages/xformers/components/attention/pooling.py
ADDED
@@ -0,0 +1,82 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn

from xformers.components.attention import Attention, AttentionConfig, register_attention


@dataclass
class PoolingAttentionConfig(AttentionConfig):
    pool_size: int  # dimension of the input sequence
    stride: Optional[int]  # dimension of the internal space
    padding: Optional[int]


@register_attention("pooling", PoolingAttentionConfig)
class Pooling(Attention):
    def __init__(
        self,
        pool_size: int = 3,
        stride: int = 1,
        padding: Optional[int] = None,
        *_,
        **__,
    ):
        """
        Pooling token mixing mechanism, as proposed in
        `Metaformer is actually what you need for vision`_, Yu et al (2021).

        The original notation is kept as is.

        .. _`Metaformer is actually what you need for vision` : https://arxiv.org/pdf/2111.11418v1.pdf
        """
        super().__init__()

        padding = padding if padding is not None else pool_size // 2
        self.pool = nn.AvgPool2d(
            pool_size,
            stride=stride,
            padding=pool_size // 2,
            count_include_pad=False,
        )

        # MHA related flags:
        # kq need to have the same dimension
        self.requires_same_k_q_dimensions = False

        # This attention does not support attention masks
        self.supports_attention_mask = False

        # This "attention" (token mixing) skips the multihead attention altogether
        self.requires_skip_multi_head = True
        self.requires_input_projection = False

        # This operator does not really handle q,k,v
        self.requires_same_k_q_dimensions = True

        # This attention requires the 2d structure out of the context,
        # implictly assumed to be a squared length
        self.requires_squared_context = True

    def forward(self, q: torch.Tensor, *_, **__):
        # Expose the 2D token structure
        B, HW, C = q.shape
        H = int(math.sqrt(HW))
        assert H * H == HW

        q = q.transpose(-2, -1).reshape(B, C, H, H)

        # 2D pool
        x_pool = self.pool(q) - q  # compensate for the residual path

        # Get back to B HW C
        return x_pool.flatten(2, 3).transpose(-2, -1)
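As a quick sanity check of the Pooling token mixer above, the sketch below feeds it a (B, HW, C) tensor with a square token grid. It assumes the mirrored xformers package is importable; the shapes are illustrative only.

import torch
from xformers.components.attention.pooling import Pooling

# 16 x 16 token grid flattened to HW = 256, channel dim C = 32
x = torch.randn(4, 256, 32)
mixer = Pooling(pool_size=3)
y = mixer(x)      # pure token mixing, no q/k/v projections involved
print(y.shape)    # torch.Size([4, 256, 32])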
.venv/lib/python3.11/site-packages/xformers/components/attention/sparsity_config.py
ADDED
|
@@ -0,0 +1,812 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""
The code has been adopted from DeepSpeed
(https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/ops/sparse_attention/sparsity_config.py)
"""

import random

import torch


class SparsityConfig:
    """Abstract Configuration class to store `sparsity configuration of a self attention layer`.
    It contains shared property of different block-sparse sparsity patterns. However, each class
    needs to extend it based on required property and functionality.
    """

    def __init__(self, num_heads, block_size=16, different_layout_per_head=False):
        """Initialize the Sparsity Pattern Config.
        Arguments:
            num_heads: required: an integer determining number of attention heads of the layer.
            block_size: optional: an integer determining the block size. Current implementation of
                sparse self-attention is based on blocked sparse matrices. In which this parameter
                defines size of such blocks, `Block X Block`.
            different_layout_per_head: optional: a boolean determining if each head should be
                assigned a different sparsity layout; default is false and this will be satisfied
                based on availability.
        """

        self.num_heads = num_heads
        self.block_size = block_size
        self.different_layout_per_head = different_layout_per_head
        self.num_layout_heads = num_heads if different_layout_per_head else 1

    def setup_layout(self, seq_len):
        """Create layout tensor for the given sequence length
        Arguments:
            seq_len: required: an integer determining number of attention heads of the layer.
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) for sparsity layout
                of all head; initialized with zero
        """

        if seq_len % self.block_size != 0:
            raise ValueError(
                f"Sequence Length, {seq_len}, needs to be dividable by Block size {self.block_size}!"
            )
        num_blocks = seq_len // self.block_size
        # TODO Currently we allocate layout per head; needs to be updated if heads share a single layout.
        layout = torch.zeros(
            (self.num_heads, num_blocks, num_blocks), dtype=torch.int64
        )
        return layout

    def check_and_propagate_first_head_layout(self, layout):
        """If all heads require same sparsity layout, it propagate first head layout to all heads
        Arguments:
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head
        """

        if not self.different_layout_per_head:
            layout[1 : self.num_heads, :, :] = layout[0, :, :]
        return layout


class DenseSparsityConfig(SparsityConfig):
    """Configuration class to store `Dense` configuration.
    In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and
    comprehension.
    """

    def __init__(self, num_heads, block_size=16, different_layout_per_head=False):
        """Initialize the Dense Sparsity Pattern Config.
        In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison
        and comprehension.
        Arguments:
            num_heads: required: an integer determining number of attention heads of the layer.
            block_size: optional: an integer determining the block size. Current implementation of
                sparse self-attention is based on blocked sparse matrices. In which this parameter
                defines size of such blocks, `Block X Block`.
            different_layout_per_head: optional: this is just for the sake of consistency with
                other sparsity formats; can ignore it for DenseSparsityConfig
        """

        super().__init__(num_heads, block_size, different_layout_per_head)

    def make_layout(self, seq_len):
        """Set 1 to all blocks of the layout meanins the pattern is dense; not sparse.
        Arguments:
            seq_len: required: an integer determining the underling sequence length;
                must be <= max sequence length
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head; for dense everything is 1
        """

        layout = self.setup_layout(seq_len)
        layout[:, :, :] = 1
        return layout


class FixedSparsityConfig(SparsityConfig):
    """Configuration class to store `Fixed` sparsity configuration.
    For more details about this sparsity config, please see `Generative Modeling with
    Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized.
    This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity.
    """

    def __init__(
        self,
        num_heads,
        block_size=16,
        different_layout_per_head=False,
        num_local_blocks=4,
        num_global_blocks=1,
        attention="bidirectional",
        horizontal_global_attention=False,
        num_different_global_patterns=1,
    ):
        """Initialize `Fixed` Sparsity Pattern Config.
        For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial
        Arguments:
            num_heads: required: an integer determining number of attention heads of the layer.
            block_size: optional: an integer determining the block size. Current implementation of
                sparse self-attention is based on blocked sparse matrices. In which this parameter
                defines size of such blocks, `Block X Block`.
            different_layout_per_head: optional: a boolean determining if each head should be
                assigned a different sparsity layout; default is false and this will be satisfied
                based on availability.
            num_local_blocks: optional: an integer determining the number of blocks in local attention
                window.
            num_global_blocks: optional: an integer determining how many consecutive blocks in a local
                window is used as the representative of the window for global attention.
            attention: optional: a string determining attention type. Attention can be `unidirectional`,
                such as autoregressive models, in which tokens attend only to tokens appear before them
                in the context. Considering that, the upper triangular of attention matrix is empty as
                above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to
                any other tokens before or after them. Then, the upper triangular part of the attention
                matrix is mirror of the lower triangular in the above figure.
            horizontal_global_attention: optional: a boolean determining if blocks that are global
                representative of a local window, also attend to all other blocks. This is valid only if
                attention type is `bidirectional`. Looking at the attention matrix, that means global
                attention not only includes the vertical blocks, but also horizontal blocks.
            num_different_global_patterns: optional: an integer determining number of different global
                attentions layouts. While global attention can be fixed by which block/s are representative
                of any local window, since there are multi-heads, each head can use a different global representative.
                For example, with 4 blocks local window and global attention size of 1 block, we can have 4 different
                versions in which the first, Second, third, or forth block of each local window can be global
                representative of that window. This parameter determines how many of such patterns we want.
                Of course, there is a limitation based on num_local_blocks and num_global_blocks.
        """

        super().__init__(num_heads, block_size, different_layout_per_head)

        self.num_local_blocks = num_local_blocks

        if num_local_blocks % num_global_blocks != 0:
            raise ValueError(
                f"""Number of blocks in a local window, {num_local_blocks},
                must be dividable by number of global blocks, {num_global_blocks}!"""
            )
        self.num_global_blocks = num_global_blocks

        if attention != "unidirectional" and attention != "bidirectional":
            raise NotImplementedError(
                'only "uni/bi-directional" attentions are supported for now!'
            )
        self.attention = attention

        if attention != "bidirectional" and horizontal_global_attention:
            raise ValueError(
                'only "bi-directional" attentions can support horizontal global attention!'
            )
        self.horizontal_global_attention = horizontal_global_attention

        if num_different_global_patterns > 1 and not different_layout_per_head:
            raise ValueError(
                """Number of different layouts cannot be more than one when you have set a single layout
                for all heads! Set different_layout_per_head to True."""
            )
        if num_different_global_patterns > (num_local_blocks // num_global_blocks):
            raise ValueError(
                f"""Number of layout versions (num_different_global_patterns), {num_different_global_patterns},
                cannot be larger than number of local window blocks divided by number of global blocks,
                {num_local_blocks} / {num_global_blocks} = {num_local_blocks//num_global_blocks}!"""
            )
        self.num_different_global_patterns = num_different_global_patterns

    def set_local_layout(self, h, layout):
        """Sets local attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which local layout is set
        """

        num_blocks = layout.shape[1]
        for i in range(0, num_blocks, self.num_local_blocks):
            end = min(i + self.num_local_blocks, num_blocks)
            for row in range(i, end):
                for col in range(
                    i, (row + 1 if self.attention == "unidirectional" else end)
                ):
                    layout[h, row, col] = 1
        return layout

    def set_global_layout(self, h, layout):
        """Sets global attention layout used by the given head in the sparse attention.
        Currently we set global blocks starting from the last block of a local window to the first one.
        That means if a local window consists of 4 blocks and global attention size is one block, we use
        block #4 in each local window as global. If we have different layout per head, then other heads
        will get #3, #2, and #1. And if we have more heads (and different layout has set) than num of global
        attentions, multiple head may have same global attentions.
        Note) if horizontal_global_attention is set, global blocks will be set both horizontally and
        vertically.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which global layout is set
        """

        num_blocks = layout.shape[1]
        first_global_block_idx = (
            self.num_local_blocks
            - (1 + h % self.num_different_global_patterns) * self.num_global_blocks
        )

        # set all global blocks except the last one if (in last local window)
        end = num_blocks - (num_blocks % self.num_local_blocks)
        for i in range(first_global_block_idx, end, self.num_local_blocks):

            # vertical global attention
            first_row = 0 if self.attention == "bidirectional" else i
            # (((i // self.num_local_blocks) + 1) * self.num_local_blocks)
            # if (first_row < num_blocks):
            layout[h, first_row:, i : i + self.num_global_blocks] = 1

            # horizontal global attention; only in bidirectional attention
            if self.horizontal_global_attention:
                layout[h, i : i + self.num_global_blocks, :] = 1

        # set last global blocks; handle possible short last local window
        if end < num_blocks:
            start = min(
                end + first_global_block_idx, num_blocks - self.num_global_blocks
            )
            end = start + self.num_global_blocks

            # vertical global attention
            first_row = 0 if self.attention == "bidirectional" else start
            # (((start // self.num_local_blocks) + 1) * self.num_local_blocks)
            # if (first_row < num_blocks):
            layout[h, first_row:, start:end] = 1

            # horizontal global attention
            if self.horizontal_global_attention:
                layout[h, start:end, :] = 1
        return layout

    def make_layout(self, seq_len):
        """Generates `Fixed` sparsity layout used by each head in the sparse attention.
        Arguments:
            seq_len: required: an integer determining number of attention heads of the layer.
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `Fixed`
                sparsity layout of all head
        """

        layout = self.setup_layout(seq_len)
        for h in range(0, self.num_layout_heads):
            layout = self.set_local_layout(h, layout)
            layout = self.set_global_layout(h, layout)

        layout = self.check_and_propagate_first_head_layout(layout)
        return layout


class VariableSparsityConfig(SparsityConfig):
    """Configuration class to store `Variable` sparsity configuration.
    This layout is an extension of FixedSparsityConfig in which:
      - user can set random layout; default value is zero means no random block
      - user can provide a list of local block sizes
      - user can provide a list of global block indices.
    For more details about `Fixed` sparsity config, please see `Generative Modeling with
    Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized.
    This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity.
    """

    def __init__(
        self,
        num_heads,
        block_size=16,
        different_layout_per_head=False,
        num_random_blocks=0,
        local_window_blocks=[4],
        global_block_indices=[0],
        global_block_end_indices=None,
        attention="bidirectional",
        horizontal_global_attention=False,
    ):
        """Initialize `Variable` Sparsity Pattern Config.
        For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial
        Arguments:
            num_heads: required: an integer determining number of attention heads of the layer.
            block_size: optional: an integer determining the block size. Current implementation of sparse
                self-attention is based on blocked sparse matrices. In which this parameter defines
                size of such blocks, `Block X Block`.
            different_layout_per_head: optional: a boolean determining if each head should be assigned a
                different sparsity layout; default is false and this will be satisfied based on
                availability. Currently this sparsity config can only assign single layout to all heads;
                needs to be extended for different layout per head.
            num_random_blocks: optional: an integer determining the number of random blocks in each block row.
            local_window_blocks: optional: a list of integers determining the number of blocks in each
                local attention window. It assumes first number determines # of blocks in the first local
                window, second the second window, ..., and the last number determines the number of blocks
                in the remaining local windows.
            global_block_indices: optional: a list of integers determining which blocks are considered
                as global attention. Given indices, determine the blocks that all other token blocks
                attend to and they attend to all other token blocks. Default value is only index 0.
                Notice that if global_block_end_indices parameter is set, this parameter is used as
                starting index of each global window.
            global_block_end_indices: optional: a list of integers determining end indices of global
                window blocks. By default this is not used. But if it is set, it must have the same size
                of global_block_indices parameter, and combining this two parameters, for each index i,
                blocks from global_block_indices[i] to global_block_end_indices[i] (exclusive) are
                considered as global attention.
            attention: optional: a string determining attention type. Attention can be `unidirectional`,
                such as autoregressive models, in which tokens attend only to tokens appear before them
                in the context. Considering that, the upper triangular of attention matrix is empty as
                above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to
                any other tokens before or after them. Then, the upper triangular part of the attention
                matrix is mirror of the lower triangular in the above figure.
            horizontal_global_attention: optional: a boolean determining if blocks that are global
                representative of a local window, also attend to all other blocks. This is valid only if
                attention type is `bidirectional`. Looking at the attention matrix, that means global
                attention not only includes the vertical blocks, but also horizontal blocks.
        """

        super().__init__(num_heads, block_size, different_layout_per_head)

        self.num_random_blocks = num_random_blocks
        self.local_window_blocks = local_window_blocks
        self.global_block_indices = global_block_indices

        if global_block_end_indices is not None:
            if len(global_block_indices) != len(global_block_end_indices):
                raise ValueError(
                    f"""Global block start indices length, {len(global_block_indices)}, must be same as
                    global block end indices length, {len(global_block_end_indices)}!"""
                )
            for _, (start_idx, end_idx) in enumerate(
                zip(global_block_indices, global_block_end_indices)
            ):
                if start_idx >= end_idx:
                    raise ValueError(
                        f"""Global block start index, {start_idx}, must be smaller than global block end
                        index, {end_idx}!"""
                    )
        self.global_block_end_indices = global_block_end_indices

        if attention != "unidirectional" and attention != "bidirectional":
            raise NotImplementedError(
                'only "uni/bi-directional" attentions are supported for now!'
            )
        self.attention = attention

        if attention != "bidirectional" and horizontal_global_attention:
            raise ValueError(
                'only "bi-directional" attentions can support horizontal global attention!'
            )
        self.horizontal_global_attention = horizontal_global_attention

    def set_random_layout(self, h, layout):
        """Sets random attention layout used by the given head in the sparse attention.
        Note) By default, it assumes there will be a unique random block layout for all heads; unless
        `different_layout_per_head` parameter is set in which each head can have a different random
        layout.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which random layout is set
        """

        num_blocks = layout.shape[1]
        if num_blocks < self.num_random_blocks:
            raise ValueError(
                f"""Number of random blocks, {self.num_random_blocks}, must be smaller than overall number
                of blocks in a row, {num_blocks}!"""
            )
        for row in range(0, num_blocks):
            rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks)
            layout[h, row, rnd_cols] = 1
        return layout

    def set_local_layout(self, h, layout):
        """Sets local attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which local layout is set
        """

        num_blocks = layout.shape[1]
        start_block_idx = 0
        end_block_idx = 0
        for block_size in self.local_window_blocks:
            end_block_idx += block_size
            end_block_idx = min(end_block_idx, num_blocks)
            for row in range(start_block_idx, end_block_idx):
                for col in range(
                    start_block_idx,
                    (row + 1 if self.attention == "unidirectional" else end_block_idx),
                ):
                    layout[h, row, col] = 1
            start_block_idx += block_size

        # if there is any remaining not attended part, use the lats local window block size as local
        # window for the remaining applicable local windows
        for i in range(start_block_idx, num_blocks, block_size):
            end_block_idx = min(i + block_size, num_blocks)
            for row in range(i, end_block_idx):
                for col in range(
                    i,
                    (row + 1 if self.attention == "unidirectional" else end_block_idx),
                ):
                    layout[h, row, col] = 1
        return layout

    def set_global_layout(self, h, layout):
        """Sets global attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which global layout is set
        """

        num_blocks = layout.shape[1]
        if self.global_block_end_indices is None:
            for idx in self.global_block_indices:
                # if global block idx is in the range of the sequence blocks
                if idx < num_blocks:
                    # global rows
                    if self.horizontal_global_attention:
                        layout[h, idx, :] = 1

                    # global columns
                    first_row = 0 if self.attention == "bidirectional" else idx
                    layout[h, first_row:, idx] = 1
        else:
            for _, (start_idx, end_idx) in enumerate(
                zip(self.global_block_indices, self.global_block_end_indices)
            ):
                # if global block idx is in the range of the sequence blocks
                if start_idx < num_blocks:
                    end_idx = min(end_idx, num_blocks)
                    # global rows
                    if self.horizontal_global_attention:
                        layout[h, start_idx:end_idx, :] = 1

                    # global columns
                    first_row = 0 if self.attention == "bidirectional" else start_idx
                    layout[h, first_row:, start_idx:end_idx] = 1
        return layout

    def make_layout(self, seq_len):
        """Generates `Variable` sparsity layout used by each head in the sparse attention.
        Arguments:
            seq_len: required: an integer determining number of attention heads of the layer.
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `Variable`
                sparsity layout of all head
        """

        layout = self.setup_layout(seq_len)
        for h in range(0, self.num_layout_heads):
            layout = self.set_random_layout(h, layout)
            layout = self.set_local_layout(h, layout)
            layout = self.set_global_layout(h, layout)

        layout = self.check_and_propagate_first_head_layout(layout)
        return layout


class BigBirdSparsityConfig(SparsityConfig):
    """Configuration class to store `BigBird` sparsity configuration.
    For more details about this sparsity config, please see `Big Bird: Transformers for
    Longer Sequences`: https://arxiv.org/pdf/2007.14062.pdf
    This class extends parent class of `SparsityConfig` and customizes it for `BigBird` sparsity.
    """

    def __init__(
        self,
        num_heads,
        block_size=16,
        different_layout_per_head=False,
        num_random_blocks=1,
        num_sliding_window_blocks=3,
        num_global_blocks=1,
        attention="bidirectional",
    ):
        """Initialize the BigBird Sparsity Pattern Config.
        For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial
        Arguments:
            num_heads: required: an integer determining number of attention heads of the layer.
            block_size: optional: an integer determining the block size. Current implementation of
                sparse self-attention is based on blocked sparse matrices. In which this parameter
                defines size of such blocks, `Block X Block`.
            different_layout_per_head: optional: a boolean determining if each head should be assigned
                a different sparsity layout; default is false and this will be satisfied based on
                availability.
            num_random_blocks: optional: an integer determining the number of random blocks in each
                block row.
            num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding
                local attention window.
            num_global_blocks: optional: an integer determining how many consecutive blocks, starting
                from index 0, are considered as global attention. Global block tokens will be attended
                by all other block tokens and will attend to all other block tokens as well.
            attention: optional: a string determining attention type. Attention can be `unidirectional`,
                such as autoregressive models, in which tokens attend only to tokens appear before them
                in the context. Considering that, the upper triangular of attention matrix is empty as
                above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to
                any other tokens before or after them. Then, the upper triangular part of the attention
                matrix is mirror of the lower triangular in the above figure.
        """

        super().__init__(num_heads, block_size, different_layout_per_head)

        self.num_random_blocks = num_random_blocks
        self.num_sliding_window_blocks = num_sliding_window_blocks
        self.num_global_blocks = num_global_blocks

        if attention != "unidirectional" and attention != "bidirectional":
            raise NotImplementedError(
                'only "uni/bi-directional" attentions are supported for now!'
            )
        self.attention = attention

    def set_random_layout(self, h, layout):
        """Sets random attention layout used by the given head in the sparse attention.
        Note) By default, it assumes there will be a unique random block layout for all heads; unless
        `different_layout_per_head` parameter is set in which each head can have a different random layout.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which random layout is set
        """

        num_blocks = layout.shape[1]
        if num_blocks < self.num_random_blocks:
            raise ValueError(
                f"""Number of random blocks, {self.num_random_blocks}, must be smaller than overall number
                of blocks in a row, {num_blocks}!"""
            )

        for row in range(0, num_blocks):
            sample_range = (
                range(0, num_blocks)
                if self.attention == "bidirectional"
                else range(0, row + 1)
            )
            rnd_cols = random.sample(sample_range, self.num_random_blocks)
            layout[h, row, rnd_cols] = 1
        return layout

    def set_sliding_window_layout(self, h, layout):
        """Sets sliding local attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which local sliding window layout is set
        """

        num_blocks = layout.shape[1]
        if num_blocks < self.num_sliding_window_blocks:
            raise ValueError(
                f"""Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than
                overall number of blocks in a row, {num_blocks}!"""
            )

        w = self.num_sliding_window_blocks // 2
        for row in range(0, num_blocks):
            start = max(0, row - w)
            end = min(row + w + 1, num_blocks)
            layout[h, row, start:end] = 1
        return layout

    def set_global_layout_itc(self, h, layout):
        """Sets global attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout
                of all head in which global layout is set
        """

        num_blocks = layout.shape[1]
        if num_blocks < self.num_global_blocks:
            raise ValueError(
                f"""Number of global blocks, {self.num_global_blocks}, must be smaller than overall number
                of blocks in a row, {num_blocks}!"""
            )

        # global rows
        layout[h, 0 : self.num_global_blocks, :] = 1

        # global columns
        layout[h, :, 0 : self.num_global_blocks] = 1

        if self.attention == "unidirectional":
            # zero out anything attending to the future
            layout = torch.tril(layout)

        return layout

    def make_layout(self, seq_len):
        """Generates `BigBird` sparsity layout used by each head in the sparse attention.
        Arguments:
            seq_len: required: an integer determining number of attention heads of the layer.
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `BigBird`
                sparsity layout of all head
        """

        layout = self.setup_layout(seq_len)
        for h in range(0, self.num_layout_heads):
            layout = self.set_random_layout(h, layout)
            layout = self.set_sliding_window_layout(h, layout)
            layout = self.set_global_layout_itc(h, layout)

        layout = self.check_and_propagate_first_head_layout(layout)
        return layout


class BSLongformerSparsityConfig(SparsityConfig):
    """Configuration class to store edited `Longformer` sparsity configuration.
    Note) this is a block-sparse version of the Longformer which is slightly different than original
    Longformer; which is element-wise sparsity.
    For more details about this sparsity config, please see `Longformer:
    The Long-Document Transformer`: https://arxiv.org/pdf/2004.05150.pdf
    This class extends parent class of `SparsityConfig` and customizes it for `Longformer` sparsity.
    """

    def __init__(
        self,
        num_heads,
        block_size=16,
        different_layout_per_head=False,
        num_sliding_window_blocks=3,
        global_block_indices=[0],
        global_block_end_indices=None,
        attention="bidirectional",
    ):
        """Initialize the edited `Longformer` Sparsity Pattern Config.
        For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial
        Arguments:
            num_heads: required: an integer determining number of attention heads of the layer.
            block_size: optional: an integer determining the block size. Current implementation of sparse
                self-attention is based on blocked sparse matrices. In which this parameter defines size
                of such blocks, `Block X Block`.
            different_layout_per_head: optional: a boolean determining if each head should be assigned a
                different sparsity layout; default is false and this will be satisfied based on
                availability.
            num_sliding_window_blocks: optional: an integer determining the number of blocks in sliding
                local attention window.
            global_block_indices: optional: a list of integers determining which blocks are considered
                as global attention. Given indices, determine the blocks that all other token blocks
                attend to and they attend to all other token blocks. Default value is only index 0.
                Notice that if global_block_end_indices parameter is set, this parameter is used as
                starting index of each global window.
            global_block_end_indices: optional: a list of integers determining end indices of global
                window blocks. By default this is not used. But if it is set, it must have the same size
                of global_block_indices parameter, and combining this two parameters, for each index i,
                blocks from global_block_indices[i] to global_block_end_indices[i] (exclusive) are
                considered as global attention.
            attention: optional: a string determining attention type. Attention can be `unidirectional`,
                such as autoregressive models, in which tokens attend only to tokens appear before them
                in the context. Considering that, the upper triangular of attention matrix is empty as
                above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to
                any other tokens before or after them. Then, the upper triangular part of the attention
                matrix is mirror of the lower triangular in the above figure.
        """

        super().__init__(num_heads, block_size, different_layout_per_head)

        self.num_sliding_window_blocks = num_sliding_window_blocks
        self.global_block_indices = global_block_indices
        self.attention = attention

        if global_block_end_indices is not None:
            if len(global_block_indices) != len(global_block_end_indices):
                raise ValueError(
                    f"""Global block start indices length, {len(global_block_indices)}, must be same as
                    global block end indices length, {len(global_block_end_indices)}!"""
                )
            for _, (start_idx, end_idx) in enumerate(
                zip(global_block_indices, global_block_end_indices)
            ):
                if start_idx >= end_idx:
                    raise ValueError(
                        f"""Global block start index, {start_idx}, must be smaller than global block end
                        index, {end_idx}!"""
                    )
        self.global_block_end_indices = global_block_end_indices

    def set_sliding_window_layout(self, h, layout):
        """Sets sliding local attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout
                of all head in which local sliding window layout is set
        """

        num_blocks = layout.shape[1]
        if num_blocks < self.num_sliding_window_blocks:
            raise ValueError(
                f"""Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller
                than overall number of blocks in a row, {num_blocks}!"""
            )

        w = self.num_sliding_window_blocks // 2
        for row in range(0, num_blocks):
            start = max(0, row - w)
            end = min(row + w + 1, num_blocks)
            layout[h, row, start:end] = 1
        return layout

    def set_global_layout(self, h, layout):
        """Sets global attention layout used by the given head in the sparse attention.
        Arguments:
            h: required: an integer determining head index
            layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing
                sparsity layout of all head; may not be completely set at this step
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity
                layout of all head in which global layout is set
        """

        num_blocks = layout.shape[1]
        if self.global_block_end_indices is None:
            for idx in self.global_block_indices:
                # if global block idx is in the range of the sequence blocks
                if idx < num_blocks:
                    # global rows
                    layout[h, idx, :] = 1

                    # global columns
                    layout[h, :, idx] = 1
        else:
            for _, (start_idx, end_idx) in enumerate(
                zip(self.global_block_indices, self.global_block_end_indices)
            ):
                # if global block idx is in the range of the sequence blocks
                if start_idx < num_blocks:
                    end_idx = min(end_idx, num_blocks)
                    # global rows
                    layout[h, start_idx:end_idx, :] = 1

                    # global columns
                    layout[h, :, start_idx:end_idx] = 1
        if self.attention == "unidirectional":
            layout = torch.tril(layout)
        return layout

    def make_layout(self, seq_len):
        """Generates edited `Longformer` sparsity layout used by each head in the sparse attention.
        Arguments:
            seq_len: required: an integer determining number of attention heads of the layer.
        Return:
            layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing `BSLongformer`
                sparsity layout of all head
        """

        layout = self.setup_layout(seq_len)
        for h in range(0, self.num_layout_heads):
            layout = self.set_sliding_window_layout(h, layout)
            layout = self.set_global_layout(h, layout)

        layout = self.check_and_propagate_first_head_layout(layout)
        return layout
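The configs above only produce block-level 0/1 layout masks; a short sketch of how such a layout can be built and inspected using the classes defined in this file (the specific sizes are illustrative):

from xformers.components.attention.sparsity_config import (
    BigBirdSparsityConfig,
    FixedSparsityConfig,
)

seq_len, block = 256, 16  # 256 tokens -> a 16 x 16 grid of blocks
fixed = FixedSparsityConfig(num_heads=8, block_size=block, num_local_blocks=4)
bigbird = BigBirdSparsityConfig(num_heads=8, block_size=block, num_random_blocks=2)

for name, cfg in [("fixed", fixed), ("bigbird", bigbird)]:
    layout = cfg.make_layout(seq_len)           # (num_heads, 16, 16) tensor of 0/1 blocks
    density = layout.float().mean().item()
    print(f"{name}: {tuple(layout.shape)}, {density:.0%} of blocks kept")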
.venv/lib/python3.11/site-packages/xformers/components/attention/utils.py
ADDED
|
@@ -0,0 +1,108 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from typing import Optional

import torch


# Reshapes key padding mask from (batch_size, src_len) -> (batch_size * num_heads 1, src_len)
def reshape_key_padding_mask(
    key_padding_mask: torch.Tensor, batched_dim: int
) -> torch.Tensor:
    assert key_padding_mask.ndim == 2
    batch_size, src_len = key_padding_mask.size()
    num_heads = batched_dim // batch_size
    return _reshape_key_padding_mask(key_padding_mask, batch_size, src_len, num_heads)


def _reshape_key_padding_mask(
    key_padding_mask: torch.Tensor, batch_size: int, src_len: int, num_heads: int
) -> torch.Tensor:
    assert key_padding_mask.shape == (batch_size, src_len)
    key_padding_mask = (
        key_padding_mask.view(batch_size, 1, 1, src_len)
        .expand(-1, num_heads, -1, -1)
        .reshape(batch_size * num_heads, 1, src_len)
    )
    return key_padding_mask


# Combine the attention mask and key padding mask into a single mask
# Taken from https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py
# Additive masking not yet supported
def maybe_merge_masks(
    att_mask: Optional[torch.Tensor],
    key_padding_mask: Optional[torch.Tensor],
    batch_size: int,
    src_len: int,
    num_heads: int,
    tgt_len: Optional[int] = None,
) -> Optional[torch.Tensor]:
    if tgt_len is None:
        tgt_len = src_len
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (batch_size, src_len)
        key_padding_mask = _reshape_key_padding_mask(
            key_padding_mask, batch_size, src_len, num_heads
        )
        if att_mask is None:
            # make sure dimensions of key padding mask are the same as those expected for att_mask
            att_mask = key_padding_mask.expand(-1, tgt_len, -1)
        # Assumption is that False means to mask.
        elif att_mask.dtype == torch.bool:
            att_mask = att_mask.logical_and(key_padding_mask)
        else:
            att_mask = att_mask.masked_fill(~key_padding_mask, float("-inf"))

    return att_mask


# Assumes that matrix passed in has had softmax applied to it.
def iterative_pinv(softmax_mat: torch.Tensor, n_iter=6, pinverse_original_init=False):
    """
    Computing the Moore-Penrose inverse.
    Use an iterative method from (Razavi et al. 2014) to approximate the Moore-Penrose inverse via efficient
    matrix-matrix multiplications.
    """

    i = torch.eye(
        softmax_mat.size(-1), device=softmax_mat.device, dtype=softmax_mat.dtype
    )
    k = softmax_mat

    # The entries of K are positive and ||K||_{\infty} = 1 due to softmax
    if pinverse_original_init:
        # This original implementation is more conservative to compute coefficient of Z_0.
        v = 1 / torch.max(torch.sum(k, dim=-2)) * k.transpose(-1, -2)
    else:
        # This is the exact coefficient computation, 1 / ||K||_1, of initialization of Z_0, leading to faster
        # convergence.
        v = (
            1
            / torch.max(torch.sum(k, dim=-2), dim=-1).values[:, None, None]
            * k.transpose(-1, -2)
        )

    for _ in range(n_iter):
        kv = torch.matmul(k, v)
        v = torch.matmul(
            0.25 * v,
            13 * i - torch.matmul(kv, 15 * i - torch.matmul(kv, 7 * i - kv)),
        )
    return v


def bool_mask_to_additive(
    mask: torch.Tensor, dtype: Optional[torch.dtype] = torch.float32
) -> torch.Tensor:
    assert (
        mask.dtype == torch.bool
    ), "This util is meant to convert in between bool masks and additive ones"

    mask_ = torch.zeros_like(mask, dtype=dtype)
    mask_[~mask] = float("-inf")
    return mask_
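A small sketch of how the mask helpers above compose, merging a boolean causal mask with a key padding mask and then converting the result to an additive mask (shapes chosen purely for illustration):

import torch
from xformers.components.attention.utils import (
    bool_mask_to_additive,
    maybe_merge_masks,
)

batch_size, num_heads, seq = 2, 4, 6
att_mask = torch.ones(seq, seq, dtype=torch.bool).tril()   # causal; False means masked
key_padding = torch.ones(batch_size, seq, dtype=torch.bool)
key_padding[:, -2:] = False                                # last two tokens are padding

merged = maybe_merge_masks(
    att_mask, key_padding, batch_size=batch_size, src_len=seq, num_heads=num_heads
)
print(merged.shape)                       # (batch_size * num_heads, seq, seq)
print(bool_mask_to_additive(merged)[0])   # 0 where allowed, -inf where masked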
.venv/lib/python3.11/site-packages/xformers/components/feedforward/__init__.py
ADDED
|
@@ -0,0 +1,78 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from pathlib import Path
from typing import Any, Callable, Dict, Set, Union

from xformers.utils import (
    generate_matching_config,
    get_registry_decorator,
    import_all_modules,
)

from .base import Feedforward, FeedforwardConfig  # noqa

# CREDITS: Classy Vision registry mechanism

FEEDFORWARD_REGISTRY: Dict[str, Any] = {}
FEEDFORWARD_CLASS_NAMES: Set[str] = set()


def build_feedforward(config: Union[Dict[str, Any], FeedforwardConfig]):
    """Builds a feedforward from a config.

    This assumes a 'name' key in the config which is used to determine what
    attention class to instantiate. For instance, a config `{"name": "my_feedforward",
    "foo": "bar"}` will find a class that was registered as "my_feedforward"
    (see :func:`register_feedforward`) and call .from_config on it."""

    if not isinstance(config, FeedforwardConfig):
        config_instance = generate_matching_config(
            config, FEEDFORWARD_REGISTRY[config["name"]].config
        )
    else:
        config_instance = config

    return FEEDFORWARD_REGISTRY[config_instance.name].constructor.from_config(
        config_instance
    )


"""Registers a Feedforward subclass.

This decorator allows xFormers to instantiate a subclass of Feedforward
from a configuration file, even if the class itself is not part of the
xFormers framework. To use it, apply this decorator to a Feedforward
subclass, like this:

.. code-block:: python

    @dataclass
    class MyConfig:
        ...

    @register_feedforward('my_ff', MyConfig)
    class MyFeedforward(Feedforward):
        ...

To instantiate a feedforward from a configuration file, see :func:`build_feedforward`."""
register_feedforward: Callable[
    [str, Any], Callable[[Any], Any]
] = get_registry_decorator(
    FEEDFORWARD_REGISTRY, FEEDFORWARD_CLASS_NAMES, Feedforward, FeedforwardConfig
)

from .mlp import MLP  # noqa

__all__ = [
    "MLP",
    "Feedforward",
    "build_feedforward",
    "register_feedforward",
]

# automatically import any Python files in the directory
import_all_modules(str(Path(__file__).parent), "xformers.components.feedforward")
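To illustrate the registry mechanism above, the sketch below inspects what has been registered and builds the plain MLP from a dict config. The registry key "MLP" and the config keys (which mirror MlpConfig in mlp.py) are assumptions taken from elsewhere in the package, not from this file.

import torch
from xformers.components.feedforward import FEEDFORWARD_REGISTRY, build_feedforward

# Names registered by the import side-effects above (contents depend on the install)
print(sorted(FEEDFORWARD_REGISTRY.keys()))

# build_feedforward resolves "MLP" in the registry and calls .from_config on it;
# the keys below are assumed to match MlpConfig (dim_model / dropout / activation /
# hidden_layer_multiplier).
ff = build_feedforward(
    {
        "name": "MLP",
        "dim_model": 64,
        "dropout": 0.0,
        "activation": "relu",
        "hidden_layer_multiplier": 4,
    }
)
print(ff(torch.randn(2, 16, 64)).shape)  # expected: torch.Size([2, 16, 64])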