diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..c91f412e117492af7e3a5c04590ecd928f843879
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,117 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
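+# Prebuilt kernel shared libraries below are tracked with Git LFS.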
+build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9871109e53967603743cd4772a7c58409838c6f4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,27 @@
+---
+tags:
+- kernels
+- cuda
+---
+CUDA kernels for MRA (Multi-Resolution Analysis) attention in Transformers.
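+
+### Usage
+
+Below is a minimal sketch of calling the `mm_to_sparse` op through the Hugging Face `kernels` loader. The repo id `kernels-community/mra` and the tensor shapes (borrowed from `benchmarks/benchmark.py`) are illustrative assumptions:
+
+```python
+import torch
+from kernels import get_kernel
+
+mra = get_kernel("kernels-community/mra")  # repo id assumed for illustration
+
+# Shapes follow benchmarks/benchmark.py: (batch * heads, num_block, head_dim, 32)
+dense_a = torch.randn(16, 4, 64, 32, device="cuda")
+dense_b = torch.randn(16, 4, 64, 32, device="cuda")
+indices = torch.randint(0, 16, (16, 4), device="cuda", dtype=torch.int32)
+
+sparse_c = mra.mm_to_sparse(dense_a, dense_b, indices)
+print(sparse_c.shape)  # torch.Size([16, 4, 32, 32])
+```
+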
+### Performance
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7638cfc1e4313445af7f04da8b7836c83c9fe602
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,131 @@
+import torch
+
+from kernels.benchmark import Benchmark
+
+
+def mm_to_sparse_reference(
+ dense_A: torch.Tensor,
+ dense_B: torch.Tensor,
+ indices: torch.Tensor,
+) -> torch.Tensor:
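+    # Pure-PyTorch reference for ops.mm_to_sparse: each entry of `indices`
+    # is a linearized block pair AB_idx = A_idx * B_num_block + B_idx, and
+    # the corresponding output block is the 32x32 product B_block.T @ A_block.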
+ batch_size = dense_A.size(0)
+ A_num_block = dense_A.size(1)
+ B_num_block = dense_B.size(1)
+ dim = dense_A.size(2)
+ num_block = indices.size(1)
+
+ # Output: (batch_size, num_block, 32, 32)
+ sparse_C = torch.zeros(
+ batch_size, num_block, 32, 32, device=dense_A.device, dtype=dense_A.dtype
+ )
+
+ for b in range(batch_size):
+ for blk in range(num_block):
+ AB_idx = indices[b, blk].item()
+ A_idx = AB_idx // B_num_block
+ B_idx = AB_idx % B_num_block
+
+ A_block = dense_A[b, A_idx] # (dim, 32)
+ B_block = dense_B[b, B_idx] # (dim, 32)
+
+ # Kernel computes C = B.T @ A: (32, dim) @ (dim, 32) = (32, 32)
+ sparse_C[b, blk] = B_block.T @ A_block
+
+ return sparse_C
+
+
+class MRABenchmark(Benchmark):
+ seed: int = 42
+
+ def setup(self):
+ # Config matching the kernel's expected format
+ batch_size = 2
+ num_heads = 8
+ head_dim = 64
+ block_size = 32 # Fixed by kernel
+
+ A_num_block = 4
+ B_num_block = 4
+ total_blocks = A_num_block * B_num_block
+ indices_per_block = 4 # Must be divisible by 4
+
+ self.batch_heads = batch_size * num_heads
+
+    # dense_A: [batch_size * num_heads, A_num_block, head_dim, 32]
+ self.dense_a = torch.randn(
+ self.batch_heads,
+ A_num_block,
+ head_dim,
+ block_size,
+ device=self.device,
+ dtype=torch.float32,
+ )
+    # dense_B: [batch_size * num_heads, B_num_block, head_dim, 32]
+ self.dense_b = torch.randn(
+ self.batch_heads,
+ B_num_block,
+ head_dim,
+ block_size,
+ device=self.device,
+ dtype=torch.float32,
+ )
+    # indices: [batch_size * num_heads, indices_per_block]
+ self.indices = torch.randint(
+ 0,
+ total_blocks,
+ (self.batch_heads, indices_per_block),
+ device=self.device,
+ dtype=torch.int32,
+ )
+
+ def benchmark_base(self):
+ self.out = self.kernel.mm_to_sparse(self.dense_a, self.dense_b, self.indices)
+
+ def verify_base(self) -> torch.Tensor:
+ return mm_to_sparse_reference(self.dense_a, self.dense_b, self.indices)
+
+ def setup_large(self):
+ batch_size = 4
+ num_heads = 8
+ head_dim = 64
+ block_size = 32
+
+ A_num_block = 8
+ B_num_block = 8
+ total_blocks = A_num_block * B_num_block
+ indices_per_block = 8 # Must be divisible by 4
+
+ self.batch_heads = batch_size * num_heads
+
+ self.dense_a = torch.randn(
+ self.batch_heads,
+ A_num_block,
+ head_dim,
+ block_size,
+ device=self.device,
+ dtype=torch.float32,
+ )
+ self.dense_b = torch.randn(
+ self.batch_heads,
+ B_num_block,
+ head_dim,
+ block_size,
+ device=self.device,
+ dtype=torch.float32,
+ )
+ self.indices = torch.randint(
+ 0,
+ total_blocks,
+ (self.batch_heads, indices_per_block),
+ device=self.device,
+ dtype=torch.int32,
+ )
+
+ def benchmark_large(self):
+ self.out = self.kernel.mm_to_sparse(self.dense_a, self.dense_b, self.indices)
+
+ def verify_large(self) -> torch.Tensor:
+ return mm_to_sparse_reference(self.dense_a, self.dense_b, self.indices)
diff --git a/build.toml b/build.toml
new file mode 100644
index 0000000000000000000000000000000000000000..8f377c9a82e3b5d560661cf52e019d45793a83e2
--- /dev/null
+++ b/build.toml
@@ -0,0 +1,20 @@
+[general]
+name = "mra"
+universal = false
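+# Non-universal: native binaries are built per torch/CUDA/platform variant (see build/).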
+
+[torch]
+src = [
+ "torch-ext/torch_binding.cpp",
+ "torch-ext/cuda_launch.h",
+]
+
+[kernel.mra]
+backend = "cuda"
+depends = ["torch"]
+src = [
+ "mra/cuda_kernel.cu",
+ "mra/cuda_kernel.h",
+ "mra/cuda_launch.cu",
+ "mra/cuda_launch.h",
+]
\ No newline at end of file
diff --git a/build/torch210-cu128-x86_64-windows/__init__.py b/build/torch210-cu128-x86_64-windows/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4246a14d6f911f7bbb378e8e6a02c3f0c5b479c
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd b/build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..442b5f9b72b0122c255630bb773bebd4913a2914
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa6a072526b11ba258ee3c95711b1582a501a40829c22bbd62b493730faee0ee
+size 795648
diff --git a/build/torch210-cu128-x86_64-windows/_ops.py b/build/torch210-cu128-x86_64-windows/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..150415099aa4c017f4ac4b1f6181ac6e917033a3
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_6ec000c
+ops = torch.ops._mra_cuda_6ec000c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_6ec000c::{op_name}"
diff --git a/build/torch210-cu128-x86_64-windows/metadata.json b/build/torch210-cu128-x86_64-windows/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d6e976250acb280ddaf231891a1822d3fdfaf30
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cu128-x86_64-windows/mra/__init__.py b/build/torch210-cu128-x86_64-windows/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc434ef44e63409acb52a8f3fff54a4adc46ed6a
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ae1d890a11ab0ff496ccb87f3ad87d754c0c7889
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de75db12cb29ce706eba61ef07d7e74f00deea71749fdd8b7bf2d56bf7178105
+size 2567952
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..73521ec3d40b855563f5a97cbfb4b8804b2a1213
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0+PTX"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c9cd671b8d98daf635393a123fc08da512ef7b7f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cc021351bfa4e923b15d186877cddf3d935d6223a369f40ffabb12507536e90
+size 2451480
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..73521ec3d40b855563f5a97cbfb4b8804b2a1213
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0+PTX"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..46913933cefe498421943d4e6306ff0fbcdd51be
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c94fe47bd01e60165517510cb90d9f8c1afa4b8092c7a7a25ef971c73a11f41
+size 2830296
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0fa8caf69cc6d291fc1318913f6fa7b3cb3334f3
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b1ce65f7d848240c848986a70ec25bc6bf1bc53c3046df1461649630afb81f8
+size 2719848
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..61037904646fad4656036cc26680e01c3a6eaa33
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1e26fb0737c8f8451d052d2514c36d64150212470214009acf0493b5862fe80
+size 2767768
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6255e63ff003b42aa6279cd430bf3bd493c6aa31
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "11.0",
+ "12.0+PTX",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..a3930bbdc9cd3edc84fdf0ba974d60340956a0cc
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26e6338feb8e2e4589397574e56ccf8b1e2761714e6ae0b5a474030b9e95f4f5
+size 2641368
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6255e63ff003b42aa6279cd430bf3bd493c6aa31
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "11.0",
+ "12.0+PTX",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..35f99fe047428839c750bd804103c51b0d749735
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb19769c43d841448daf6deb84ff8358cef905b1df26aed4d60bf38b1ab819e0
+size 2567952
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..73521ec3d40b855563f5a97cbfb4b8804b2a1213
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0+PTX"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..bc856d8e3328617afcbcaac25244fbac6841c5fc
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dd2ac9defcbaf5d03db15bc1bd55476e4520c3eb91b157a6f2488d37a16f011
+size 2451480
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..73521ec3d40b855563f5a97cbfb4b8804b2a1213
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0+PTX"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..588cb13c4a129a7310b33d314b4cd2161d596f2f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2de1386fdd980b3e5fabe1139ae026077ac82ba7c7b6ac48c516a8f67674501
+size 2830296
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/mra/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3f37172ac862376c934dd0bbc7cb7ee1e54aefc3
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55c46f680dc0f314e39ae37b138c9a7c74cf12b77fc41d8a71f4de0692803a92
+size 2719848
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/mra/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
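
The hashed module name is what lets several build variants of the same package coexist in one process without clobbering each other in sys.modules. A standalone illustration, assuming _import_from_path as defined above is in scope: the same source imported from two paths yields two independent modules.

    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp:
        for sub in ("a", "b"):
            pkg = Path(tmp) / sub
            pkg.mkdir()
            (pkg / "__init__.py").write_text("who = __name__")
        mod_a = _import_from_path(Path(tmp) / "a" / "__init__.py")
        mod_b = _import_from_path(Path(tmp) / "b" / "__init__.py")
        # Distinct path hashes -> distinct sys.modules entries -> no aliasing.
        assert mod_a is not mod_b and mod_a.who != mod_b.who
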
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..8eba2522fcb8af286faa83846e763fd683765f44
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0592b2f2d4720899c34ab36c443060f73fa53e2aa2cbd94a42ce99f352e5ee9a
+size 2767768
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6255e63ff003b42aa6279cd430bf3bd493c6aa31
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "11.0",
+ "12.0+PTX",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
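
Note that the cu130 arch list above differs from the cu128 one: 7.0 and 7.2 are gone and 10.1 is replaced by 11.0. That is consistent with CUDA 13 dropping Volta support and renumbering the Thor architecture, though this is an inference from the lists rather than anything stated in this repo.
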
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/mra/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c6a6fc794724897092a13f3ceefa8a9f7eeb07c2
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1100047c3ad05fa54430cad675e934896bdfb8eeae70903e1e452d9a30cc789
+size 2641368
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6255e63ff003b42aa6279cd430bf3bd493c6aa31
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "11.0",
+ "12.0+PTX",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/mra/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/mra/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/mra/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66aade4ca52539fe67db2b460d17f3d8acd36581
Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ab6dd7f47dc2e72bfda4f42eb02def383d5b18f
Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_9e0f4db.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_9e0f4db.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c45d517421f6a935538eb43410bedbb6aefda177
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_9e0f4db.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d0971139abac58983b682ff6200585383f27f5050766eda054f5cbd015cf011
+size 2289080
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/mra/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/mra/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6050d31c8fe3317a842e237acda7577b15f1e9
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/mra/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_9e0f4db
+ops = torch.ops._mra_9e0f4db
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_9e0f4db::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/mra/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/mra/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fb3495aa08770e82546f8a8adc7acae2113541e
Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..832adc6bf5fbd1b66095d05178297910786609f1
Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..38921bfafeaa5e6c8c6e3add913c9b6004b24d4d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:639a1f35e23433584b3baea0105a7b3005c7b6bfbc55eb2b279c2ddfc7c3e656
+size 2334464
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/mra/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/mra/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6050d31c8fe3317a842e237acda7577b15f1e9
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/mra/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_9e0f4db
+ops = torch.ops._mra_9e0f4db
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_9e0f4db::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/mra/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/mra/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..899e51bb9291e06749e554329c5a20596d3ac0cf
Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/mra/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..794b60b0e72c6bd52cda3d8484df427a3ba814bb
Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/mra/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..6fdff80753d3442bb860928ba35dcbcfb3fee662
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9950985a6a1b46593e55a3b8b4b93f7c691d5a37737ce24a0afb2a32e3b0bba9
+size 2602624
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/mra/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/mra/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6050d31c8fe3317a842e237acda7577b15f1e9
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/mra/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_9e0f4db
+ops = torch.ops._mra_9e0f4db
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_9e0f4db::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..1f5b8e58729c4a5e3cabfa6c87f500557508abb5
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a9c897734397d26fc2b0a86a4f4fb6a60762ed9ccc47490bc9b7a2452926440
+size 2446064
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8371d6e822162ec41a463169da3cea0a507f4b62
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_41ac1dc
+ops = torch.ops._mra_41ac1dc
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_41ac1dc::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/mra/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..458a52a9b62c17b819173bf90c745f5b360ec9a1
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b6b507f6b840954a5193733b155258fcfdb9525b42bb74923eff4d8e8979761
+size 2714440
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8371d6e822162ec41a463169da3cea0a507f4b62
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_41ac1dc
+ops = torch.ops._mra_41ac1dc
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_41ac1dc::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/mra/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_mra_41ac1dc.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/_mra_41ac1dc.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0cc0189fcb60826150c637579495d886f3e5a502
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_mra_41ac1dc.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc073211a7e153133529c099f8eec9bd97369907c52dae29ee7a3f68d4a063c5
+size 2748224
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8371d6e822162ec41a463169da3cea0a507f4b62
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_41ac1dc
+ops = torch.ops._mra_41ac1dc
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_41ac1dc::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/mra/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so b/build/torch29-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c753e514125149c316db6ffad654f32b64c81fda
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60e59a7c3b2e88d981aa9c670310b766a8697ff65e4a95ceb208945629e93000
+size 2566160
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdee5dbb111943beceda92293af1bfff5e51c99
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_8d73b81
+ops = torch.ops._mra_cuda_8d73b81
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_8d73b81::{op_name}"
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/metadata.json b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..73521ec3d40b855563f5a97cbfb4b8804b2a1213
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0+PTX"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/mra/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..7eaaecd6d488083818e87df16a5ade1dd3d13279
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a6eb0aa709d3bcbd98dde99e2aaa98a980942d70f23ccf003ea534b9d21edbb
+size 2446064
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdee5dbb111943beceda92293af1bfff5e51c99
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_8d73b81
+ops = torch.ops._mra_cuda_8d73b81
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_8d73b81::{op_name}"
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..73521ec3d40b855563f5a97cbfb4b8804b2a1213
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0+PTX"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/mra/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so b/build/torch29-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..9460471c557ac8ec531c65d92452cb5f5ef3fc04
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:126965553f20913e97d9827bb6363cf1147d8e673040161e84ef5e6f66d46186
+size 2828496
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdee5dbb111943beceda92293af1bfff5e51c99
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_8d73b81
+ops = torch.ops._mra_cuda_8d73b81
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_8d73b81::{op_name}"
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/metadata.json b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/mra/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..07daae02b55a986c93576de4f465e6b5a4ee0885
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:209c5944a8fd85bfe8328cbad3ee5b7bdd68864e463a7e18a76109f6c0398fc7
+size 2714440
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdee5dbb111943beceda92293af1bfff5e51c99
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_8d73b81
+ops = torch.ops._mra_cuda_8d73b81
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_8d73b81::{op_name}"
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/mra/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ee6724b261c1f3e4981a5151829e55237b203b3a
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d863ce68196a244a628191e4b42361baa1ff5e4309afe704e090fa6ee237c57e
+size 2829920
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/mra/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..61a3e619952db80e45c22a6d254d90672e6c0bdf
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50483a6e187b54c590af77b0ca375bd072b4b8be3c36ed46acba254ce285f73f
+size 2748224
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3578e4a36127a6f124954fae2199e9c83726251
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_c1eaa2d
+ops = torch.ops._mra_cuda_c1eaa2d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_c1eaa2d::{op_name}"
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8434d821584ca7b32e25d9cda24dae18561df8b
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,20 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0+PTX",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/mra/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be used for other imports. So we derive a module name
+ # that is unique per path, using the hex-encoded hash of the absolute
+ # path as the name.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so b/build/torch29-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..00069af0fb4ae956009c53abf2ac7ee082777c4a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a893b370f285f0b730668582381cb6158144d29136d956ef6c066a2c8518e22c
+size 2765976
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdee5dbb111943beceda92293af1bfff5e51c99
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_8d73b81
+ops = torch.ops._mra_cuda_8d73b81
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_8d73b81::{op_name}"
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/metadata.json b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6255e63ff003b42aa6279cd430bf3bd493c6aa31
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "11.0",
+ "12.0+PTX",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/mra/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it has been added to
+ # `sys.modules`, it would also be used for other imports. So we derive a
+ # module name that depends on the path, using the hex-encoded hash of the
+ # path to keep it unique.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so b/build/torch29-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..7fad98df6a7df6fedaf180db67bf86f2e5f86f1e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0339fd04b1ef7401a7d592f7cf6bfe8092a63c3fc97d64562c48f0cd3944d40
+size 2640048
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fdee5dbb111943beceda92293af1bfff5e51c99
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _mra_cuda_8d73b81
+ops = torch.ops._mra_cuda_8d73b81
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_mra_cuda_8d73b81::{op_name}"
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/metadata.json b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6255e63ff003b42aa6279cd430bf3bd493c6aa31
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "11.0",
+ "12.0+PTX",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/mra/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/mra/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it has been added to
+ # `sys.modules`, it would also be used for other imports. So we derive a
+ # module name that depends on the path, using the hex-encoded hash of the
+ # path to keep it unique.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000000000000000000000000000000000000..85b5d60a855bf4c19555cc9b8de8ca88d6fd3ae9
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,168 @@
+{
+ "nodes": {
+ "flake-compat": {
+ "locked": {
+ "lastModified": 1747046372,
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-compat_2": {
+ "locked": {
+ "lastModified": 1747046372,
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "flake-utils_2": {
+ "inputs": {
+ "systems": "systems_2"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "hf-nix": {
+ "inputs": {
+ "flake-compat": "flake-compat_2",
+ "flake-utils": "flake-utils_2",
+ "nixpkgs": "nixpkgs"
+ },
+ "locked": {
+ "lastModified": 1759851564,
+ "narHash": "sha256-Xybkhm0FM/VzlZ5WndTYq/X/9MAeddd4EQ2Vz8GdkOA=",
+ "owner": "huggingface",
+ "repo": "hf-nix",
+ "rev": "351655d9f124805ed7c1193aa61550ce245f4570",
+ "type": "github"
+ },
+ "original": {
+ "owner": "huggingface",
+ "repo": "hf-nix",
+ "type": "github"
+ }
+ },
+ "kernel-builder": {
+ "inputs": {
+ "flake-compat": "flake-compat",
+ "flake-utils": "flake-utils",
+ "hf-nix": "hf-nix",
+ "nixpkgs": [
+ "kernel-builder",
+ "hf-nix",
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1760035358,
+ "narHash": "sha256-N5vmCrgwcIluPclf/hmnofLK77EJJYh5PR8SRvw++es=",
+ "owner": "huggingface",
+ "repo": "kernel-builder",
+ "rev": "a48cbd19ae7e425dfc1865188ef06dac43ab9244",
+ "type": "github"
+ },
+ "original": {
+ "owner": "huggingface",
+ "repo": "kernel-builder",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1755963616,
+ "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "ref": "nixos-unstable-small",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "kernel-builder": "kernel-builder"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ },
+ "systems_2": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000000000000000000000000000000000000..9d5ee16d31cb9da4c4174b81c095f11905027def
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,17 @@
+{
+ description = "Flake for mra kernels";
+
+ inputs = {
+ kernel-builder.url = "github:huggingface/kernel-builder";
+ };
+
+ outputs =
+ {
+ self,
+ kernel-builder,
+ }:
+ kernel-builder.lib.genFlakeOutputs {
+ path = ./.;
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+ };
+}
diff --git a/media/benches_dark_animation.svg b/media/benches_dark_animation.svg
new file mode 100644
index 0000000000000000000000000000000000000000..e5c9745483eea517aa1e2a6b703f25184e5228b5
--- /dev/null
+++ b/media/benches_dark_animation.svg
@@ -0,0 +1,33 @@
+
\ No newline at end of file
diff --git a/media/benches_dark_latency.svg b/media/benches_dark_latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..452f7ae043709df21bf2b8ce8559fc330657d601
--- /dev/null
+++ b/media/benches_dark_latency.svg
@@ -0,0 +1,1869 @@
+
+
+
diff --git a/media/benches_dark_throughput.svg b/media/benches_dark_throughput.svg
new file mode 100644
index 0000000000000000000000000000000000000000..0c87824e32ffca10c29a702249b977dd76697096
--- /dev/null
+++ b/media/benches_dark_throughput.svg
@@ -0,0 +1,2090 @@
+
+
+
diff --git a/media/benches_light_animation.svg b/media/benches_light_animation.svg
new file mode 100644
index 0000000000000000000000000000000000000000..64deeb89de5ff8540a93302f3c43d5de2a3010f5
--- /dev/null
+++ b/media/benches_light_animation.svg
@@ -0,0 +1,33 @@
+
\ No newline at end of file
diff --git a/media/benches_light_latency.svg b/media/benches_light_latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1d664591837acf68eed83daf12fe41953e4c07e2
--- /dev/null
+++ b/media/benches_light_latency.svg
@@ -0,0 +1,1869 @@
+
+
+
diff --git a/media/benches_light_throughput.svg b/media/benches_light_throughput.svg
new file mode 100644
index 0000000000000000000000000000000000000000..115cbc2c65a603ff924d0181c81b97e2935a851a
--- /dev/null
+++ b/media/benches_light_throughput.svg
@@ -0,0 +1,2090 @@
+
+
+
diff --git a/mra/cuda_kernel.cu b/mra/cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..87ed89052873813153786bd416a981d3e5279af9
--- /dev/null
+++ b/mra/cuda_kernel.cu
@@ -0,0 +1,383 @@
+#include "cuda_kernel.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void index_max_cuda_kernel(
+ float *index_vals, // [batch_size, 32, num_block]
+ int *indices, // [batch_size, num_block]
+ float *max_vals, // [batch_size, A_num_block * 32]
+ float *max_vals_scatter, // [batch_size, 32, num_block]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long num_block
+) {
+
+ long batch_idx = blockIdx.x;
+
+ long thread_idx = threadIdx.x;
+ long num_thread = blockDim.x;
+
+ extern __shared__ float buffer[];
+ int *max_buffer = (int*)buffer;
+
+ for (int i = 0; i < A_num_block * 32; i = i + num_thread) {
+ int idx = i + thread_idx;
+ if (idx < A_num_block * 32) {
+ max_buffer[idx] = -1e8;
+ }
+ }
+ __syncthreads();
+
+ int *indices_pt = &indices[batch_idx * num_block];
+ float *index_vals_pt = &index_vals[batch_idx * num_block * 32];
+
+ for (int idx_start = 0; idx_start < 32 * num_block; idx_start = idx_start + num_thread) {
+ int idx = idx_start + thread_idx;
+ int A_block_idx = indices_pt[idx % num_block] / B_num_block;
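+ // atomicMax is only defined for integer types, so index_vals is quantized
+ // to fixed point (x * 1000) for the atomic update and rescaled on readback.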
+ atomicMax(&max_buffer[A_block_idx * 32 + idx / num_block], (int)(index_vals_pt[idx] * 1000));
+ }
+ __syncthreads();
+
+ float *max_vals_pt = &max_vals[batch_idx * A_num_block * 32];
+ for (int i = 0; i < A_num_block * 32; i = i + num_thread) {
+ int idx = i + thread_idx;
+ if (idx < A_num_block * 32) {
+ max_vals_pt[idx] = (float)max_buffer[idx] / 1000.;
+ }
+ }
+
+ float *max_vals_scatter_pt = &max_vals_scatter[batch_idx * num_block * 32];
+ for (int idx_start = 0; idx_start < 32 * num_block; idx_start = idx_start + num_thread) {
+ int idx = idx_start + thread_idx;
+ int A_block_idx = indices_pt[idx % num_block] / B_num_block;
+ max_vals_scatter_pt[idx] = (float)max_buffer[A_block_idx * 32 + idx / num_block] / 1000.;
+ }
+
+}
+
+__global__ void mm_to_sparse_cuda_kernel(
+ float *dense_A, // [batch_size, A_num_block, dim, 32]
+ float *dense_B, // [batch_size, B_num_block, dim, 32]
+ int *indices, // [batch_size, num_block]
+ float *sparse_C, // [batch_size, num_block, 32, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long dim,
+ long num_block
+) {
+
+ long batch_idx = blockIdx.y;
+ long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ long thread_idx = threadIdx.x;
+
+ __shared__ float buffer[4096];
+ float *A_buffer = &buffer[threadIdx.y * 1024]; // [2, 8, 32]
+ float *B_buffer = &buffer[threadIdx.y * 1024 + 512]; // [2, 8, 32]
+
+ long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+ long AB_block_idx = indices[batch_idx__block_idx];
+ float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * dim * 32];
+ float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * dim * 32];
+
+ int reg_1_idx = thread_idx / 8; // [0000000011111111222222223333333344444444555555556666666677777777]
+ int reg_2_idx = thread_idx % 8; // [0123456701234567012345670123456701234567012345670123456701234567]
+
+ float reg_1[8];
+ float reg_2[8];
+
+ float reg_array[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ A_buffer[i * 64 + thread_idx] = dense_A_pt[i * 64 + thread_idx];
+ B_buffer[i * 64 + thread_idx] = dense_B_pt[i * 64 + thread_idx];
+ }
+
+ __syncthreads();
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ reg_1[i] = A_buffer[reg_1_idx * 4 + i];
+ reg_2[i] = B_buffer[reg_2_idx * 4 + i];
+ }
+
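+ // Double-buffered main loop: multiply using the 8-wide dim slice loaded in
+ // the previous iteration while prefetching the next slice into the other
+ // half of the shared-memory tiles.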
+ for (int dim_stride = 1; dim_stride < (dim / 8); dim_stride++) {
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ A_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_A_pt[dim_stride * 256 + i * 64 + thread_idx];
+ B_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_B_pt[dim_stride * 256 + i * 64 + thread_idx];
+ }
+
+ #pragma unroll
+ for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) {
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_1_idx * 4 + i];
+ reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_2_idx * 4 + i];
+ }
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+ }
+ }
+ }
+
+ __syncthreads();
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ reg_1[i] = A_buffer[(dim_stride % 2) * 256 + reg_1_idx * 4 + i];
+ reg_2[i] = B_buffer[(dim_stride % 2) * 256 + reg_2_idx * 4 + i];
+ }
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+ }
+ }
+
+ }
+
+ #pragma unroll
+ for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) {
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[256 + mini_dim_idx * 32 + reg_1_idx * 4 + i];
+ reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[256 + mini_dim_idx * 32 + reg_2_idx * 4 + i];
+ }
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+ }
+ }
+ }
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+ }
+ }
+ __syncthreads();
+
+ float *C_buffer = &buffer[threadIdx.y * 1024]; // [32, 32]
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ C_buffer[(reg_2_idx * 4 + j) * 32 + reg_1_idx * 4 + i] = reg_array[i * 4 + j];
+ }
+ }
+ __syncthreads();
+
+ float *sparse_C_pt = &sparse_C[batch_idx__block_idx * 1024];
+
+ #pragma unroll
+ for (int i = 0; i < 16; i++) {
+ sparse_C_pt[i * 64 + thread_idx] = C_buffer[i * 64 + thread_idx];
+ }
+
+}
+
+__global__ void sparse_dense_mm_cuda_kernel(
+ float *sparse_A, // [batch_size, num_block, 32, 32]
+ int *indices, // [batch_size, num_block]
+ float *dense_B, // [batch_size, B_num_block, dim, 32]
+ float *dense_C, // [batch_size, A_num_block, dim, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long dim,
+ long num_block
+) {
+
+ long batch_idx = blockIdx.y;
+ long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ long thread_idx = threadIdx.x;
+
+ __shared__ float buffer[6144];
+ float *A_buffer = &buffer[threadIdx.y * 3072]; // [32, 32]
+ float *B_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [32, 64]
+
+ long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+ float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024];
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ A_buffer[i * 128 + thread_idx] = sparse_A_pt[i * 128 + thread_idx];
+ }
+
+ long AB_block_idx = indices[batch_idx__block_idx];
+ float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * 32 * dim];
+ float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32 * dim];
+
+ // [0000000011111111222222223333333344444444555555556666666677777777]
+ // [0123456701234567012345670123456701234567012345670123456701234567]
+ int reg_1_idx = thread_idx / 8;
+ int reg_2_idx = thread_idx % 8;
+
+ float reg_1[8];
+ float reg_2[8];
+
+ float reg_array[16];
+
+ for (int dim_stride = 0; dim_stride < dim; dim_stride = dim_stride + 64) {
+
+ #pragma unroll
+ for (int i = 0; i < 16; i++) {
+ B_buffer[i * 128 + thread_idx] = dense_B_pt[dim_stride * 32 + i * 128 + thread_idx];
+ }
+
+ #pragma unroll
+ for (int i = 0; i < 16; i++) {
+ reg_array[i] = 0;
+ }
+
+ __syncthreads();
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ reg_1[i] = B_buffer[(reg_1_idx * 4 + i) * 32];
+ reg_2[i] = A_buffer[reg_2_idx * 4 + i];
+ }
+
+ #pragma unroll
+ for (int mini_dim_idx = 1; mini_dim_idx < 32; mini_dim_idx++) {
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ reg_1[(mini_dim_idx % 2) * 4 + i] = B_buffer[(reg_1_idx * 4 + i) * 32 + mini_dim_idx];
+ reg_2[(mini_dim_idx % 2) * 4 + i] = A_buffer[mini_dim_idx * 32 + reg_2_idx * 4 + i];
+ }
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+ }
+ }
+ }
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+ }
+ }
+
+ __syncthreads();
+
+ float *C_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [64, 32]
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ #pragma unroll
+ for (int j = 0; j < 4; j++) {
+ C_buffer[(reg_1_idx * 4 + i) * 32 + reg_2_idx * 4 + j] = reg_array[i * 4 + j];
+ }
+ }
+ __syncthreads();
+
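+ // Several sparse blocks may map to the same A row block, so partial
+ // results are accumulated into dense_C with atomicAdd.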
+ #pragma unroll
+ for (int i = 0; i < 16; i++) {
+ atomicAdd(&dense_C_pt[dim_stride * 32 + i * 128 + thread_idx], C_buffer[i * 128 + thread_idx]);
+ }
+ __syncthreads();
+
+ }
+
+}
+
+
+__global__ void reduce_sum_cuda_kernel(
+ float *sparse_A, // [batch_size, num_block, 32, 32]
+ int *indices, // [batch_size, num_block]
+ float *dense_C, // [batch_size, A_num_block, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long num_block
+) {
+
+ long batch_idx = blockIdx.y;
+ long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ long thread_idx = threadIdx.x;
+
+ long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+ long AB_block_idx = indices[batch_idx__block_idx];
+ float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024];
+
+ float reg_array[16];
+ float value = 0;
+
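+ // Each thread sums one 32-element column of the 32x32 block,
+ // software-pipelined through reg_array in chunks of 8 rows.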
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ reg_array[i] = sparse_A_pt[i * 32 + thread_idx];
+ }
+ #pragma unroll
+ for (int stride = 8; stride < 32; stride = stride + 8) {
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ reg_array[(stride + i) % 16] = sparse_A_pt[(stride + i) * 32 + thread_idx];
+ }
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ value = value + reg_array[(stride - 8 + i) % 16];
+ }
+ }
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ value = value + reg_array[8 + i];
+ }
+
+ float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32];
+
+ atomicAdd(&dense_C_pt[thread_idx], value);
+
+}
+
+__global__ void scatter_cuda_kernel(
+ float *dense_A, // [batch_size, A_num_block, 32]
+ int *indices, // [batch_size, num_block]
+ float *sparse_C, // [batch_size, num_block, 32, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long num_block
+) {
+
+ long batch_idx = blockIdx.y;
+ long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ long thread_idx = threadIdx.x;
+
+ long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+ long AB_block_idx = indices[batch_idx__block_idx];
+ float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32];
+ float *sparse_C_pt = &sparse_C[(batch_idx * num_block + block_idx) * 1024];
+
+ float value = dense_A_pt[thread_idx];
+
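+ // Broadcast this A block's 32-vector across all 32 rows of the sparse block.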
+ #pragma unroll
+ for (int i = 0; i < 32; i++) {
+ sparse_C_pt[i * 32 + thread_idx] = value;
+ }
+
+}
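
For orientation, a rough pure-PyTorch rendering of the ops above, inferred from the kernels' indexing (an unverified sketch; the function names are illustrative, and the transpose convention of sparse_C follows from the C_buffer store pattern in mm_to_sparse_cuda_kernel):

    import torch

    def mm_to_sparse_ref(dense_A, dense_B, indices):
        # dense_A: [B, A_nb, dim, 32], dense_B: [B, B_nb, dim, 32], indices: [B, nb]
        B_nb = dense_B.size(1)
        batch = torch.arange(dense_A.size(0), device=dense_A.device).unsqueeze(1)
        a = dense_A[batch, (indices // B_nb).long()]  # [B, nb, dim, 32]
        b = dense_B[batch, (indices % B_nb).long()]   # [B, nb, dim, 32]
        # sparse_C[n, j, i] = sum_d a[n, d, i] * b[n, d, j]
        return torch.einsum("bndi,bndj->bnji", a, b)

    def sparse_dense_mm_ref(sparse_A, indices, dense_B, A_num_block):
        # sparse_A: [B, nb, 32, 32], dense_B: [B, B_nb, dim, 32]
        B_nb, dim = dense_B.size(1), dense_B.size(2)
        batch = torch.arange(dense_B.size(0), device=dense_B.device).unsqueeze(1)
        b = dense_B[batch, (indices % B_nb).long()]             # [B, nb, dim, 32]
        contrib = torch.einsum("bnji,bndj->bndi", sparse_A, b)  # [B, nb, dim, 32]
        out = dense_B.new_zeros(dense_B.size(0), A_num_block, dim, 32)
        a_idx = (indices // B_nb).long()[:, :, None, None].expand_as(contrib)
        return out.scatter_add_(1, a_idx.contiguous(), contrib)

    def reduce_sum_ref(sparse_A, indices, A_num_block, B_num_block):
        # Column sums of each 32x32 block, scattered to the owning A block.
        col_sum = sparse_A.sum(dim=2)  # [B, nb, 32]
        out = sparse_A.new_zeros(sparse_A.size(0), A_num_block, 32)
        a_idx = (indices // B_num_block).long()[:, :, None].expand_as(col_sum)
        return out.scatter_add_(1, a_idx.contiguous(), col_sum)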
diff --git a/mra/cuda_kernel.h b/mra/cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a95b46f7d159b11851143710034cf80c20aa6bf8
--- /dev/null
+++ b/mra/cuda_kernel.h
@@ -0,0 +1,59 @@
+
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
+#define OPTIMAL_THREADS 256
+
+__global__ void index_max_cuda_kernel(
+ float *index_vals, // [batch_size, 32, num_block]
+ int *indices, // [batch_size, num_block]
+ float *max_vals, // [batch_size, A_num_block * 32]
+ float *max_vals_scatter, // [batch_size, 32, num_block]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long num_block
+);
+
+__global__ void mm_to_sparse_cuda_kernel(
+ float *dense_A, // [batch_size, A_num_block, dim, 32]
+ float *dense_B, // [batch_size, B_num_block, dim, 32]
+ int *indices, // [batch_size, num_block]
+ float *sparse_C, // [batch_size, num_block, 32, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long dim,
+ long num_block
+);
+
+__global__ void sparse_dense_mm_cuda_kernel(
+ float *sparse_A, // [batch_size, num_block, 32, 32]
+ int *indices, // [batch_size, num_block]
+ float *dense_B, // [batch_size, B_num_block, dim, 32]
+ float *dense_C, // [batch_size, A_num_block, dim, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long dim,
+ long num_block
+);
+
+__global__ void reduce_sum_cuda_kernel(
+ float *sparse_A, // [batch_size, num_block, 32, 32]
+ int *indices, // [batch_size, num_block]
+ float *dense_C, // [batch_size, A_num_block, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long num_block
+);
+
+__global__ void scatter_cuda_kernel(
+ float *dense_A, // [batch_size, A_num_block, 32]
+ int *indices, // [batch_size, num_block]
+ float *sparse_C, // [batch_size, num_block, 32, 32]
+ long batch_size,
+ long A_num_block,
+ long B_num_block,
+ long num_block
+);
diff --git a/mra/cuda_launch.cu b/mra/cuda_launch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd9565875380d3267191e102ca9f4ff5dc381a0e
--- /dev/null
+++ b/mra/cuda_launch.cu
@@ -0,0 +1,154 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "cuda_launch.h"
+#include "cuda_kernel.h"
+#include <vector>
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::vector<at::Tensor> index_max_kernel(
+ at::Tensor index_vals, // [batch_size, 32, num_block]
+ at::Tensor indices, // [batch_size, num_block],
+ int A_num_block,
+ int B_num_block
+) {
+ int batch_size = indices.size(0);
+ int num_block = indices.size(1);
+
+ at::Tensor max_vals = at::zeros({batch_size, A_num_block * 32}, index_vals.options());
+ at::Tensor max_vals_scatter = at::zeros({batch_size, 32, num_block}, index_vals.options());
+
+ dim3 threads(256);
+ dim3 blocks(batch_size);
+ int shared_mem = A_num_block * 32 * sizeof(float);
+
+ index_max_cuda_kernel<<<blocks, threads, shared_mem>>>(
+ index_vals.data_ptr<float>(),
+ indices.data_ptr<int>(),
+ max_vals.data_ptr<float>(),
+ max_vals_scatter.data_ptr<float>(),
+ batch_size,
+ A_num_block,
+ B_num_block,
+ num_block
+ );
+
+ return {max_vals, max_vals_scatter};
+}
+
+at::Tensor mm_to_sparse_kernel(
+ at::Tensor dense_A, // [batch_size, A_num_block, dim, 32]
+ at::Tensor dense_B, // [batch_size, B_num_block, dim, 32]
+ at::Tensor indices // [batch_size, num_block]
+) {
+ int batch_size = dense_A.size(0);
+ int A_num_block = dense_A.size(1);
+ int B_num_block = dense_B.size(1);
+ int dim = dense_A.size(2);
+ int num_block = indices.size(1);
+
+ at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options());
+
+ dim3 threads(64, 4);
+ dim3 blocks(num_block / 4, batch_size);
+
+ mm_to_sparse_cuda_kernel<<<blocks, threads>>>(
+ dense_A.data_ptr<float>(),
+ dense_B.data_ptr<float>(),
+ indices.data_ptr<int>(),
+ sparse_C.data_ptr<float>(),
+ batch_size,
+ A_num_block,
+ B_num_block,
+ dim,
+ num_block
+ );
+
+ return sparse_C;
+}
+
+at::Tensor sparse_dense_mm_kernel(
+ at::Tensor sparse_A, // [batch_size, num_block, 32, 32]
+ at::Tensor indices, // [batch_size, num_block]
+ at::Tensor dense_B, // [batch_size, B_num_block, dim, 32]
+ int A_num_block
+) {
+ int batch_size = sparse_A.size(0);
+ int num_block = sparse_A.size(1);
+ int B_num_block = dense_B.size(1);
+ int dim = dense_B.size(2);
+
+ at::Tensor dense_C = at::zeros({batch_size, A_num_block, dim, 32}, dense_B.options());
+
+ dim3 threads(128, 2);
+ dim3 blocks(num_block / 2, batch_size);
+
+ sparse_dense_mm_cuda_kernel<<<blocks, threads>>>(
+ sparse_A.data_ptr<float>(),
+ indices.data_ptr<int>(),
+ dense_B.data_ptr<float>(),
+ dense_C.data_ptr<float>(),
+ batch_size,
+ A_num_block,
+ B_num_block,
+ dim,
+ num_block
+ );
+
+ return dense_C;
+}
+
+at::Tensor reduce_sum_kernel(
+ at::Tensor sparse_A, // [batch_size, num_block, 32, 32]
+ at::Tensor indices, // [batch_size, num_block]
+ int A_num_block,
+ int B_num_block
+) {
+ int batch_size = sparse_A.size(0);
+ int num_block = sparse_A.size(1);
+
+ at::Tensor dense_C = at::zeros({batch_size, A_num_block, 32}, sparse_A.options());
+
+ dim3 threads(32, 4);
+ dim3 blocks(num_block / 4, batch_size);
+
+ reduce_sum_cuda_kernel<<<blocks, threads>>>(
+ sparse_A.data_ptr<float>(),
+ indices.data_ptr<int>(),
+ dense_C.data_ptr<float>(),
+ batch_size,
+ A_num_block,
+ B_num_block,
+ num_block
+ );
+
+ return dense_C;
+}
+
+at::Tensor scatter_kernel(
+ at::Tensor dense_A, // [batch_size, A_num_block, 32]
+ at::Tensor indices, // [batch_size, num_block]
+ int B_num_block
+) {
+ int batch_size = dense_A.size(0);
+ int A_num_block = dense_A.size(1);
+ int num_block = indices.size(1);
+
+ at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options());
+
+ dim3 threads(32, 4);
+ dim3 blocks(num_block / 4, batch_size);
+
+ scatter_cuda_kernel<<<blocks, threads>>>(
+ dense_A.data_ptr<float>(),
+ indices.data_ptr<int>(),
+ sparse_C.data_ptr<float>(),
+ batch_size,
+ A_num_block,
+ B_num_block,
+ num_block
+ );
+
+ return sparse_C;
+}
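
None of these launchers validate their inputs: a num_block that is not a multiple of 4 (2 for sparse_dense_mm) silently drops blocks, and sparse_dense_mm tiles dim in strides of 64. A caller-side guard might look like this (a sketch; the helper name is an assumption):

    import torch

    def check_mra_inputs(indices: torch.Tensor, dim: int) -> None:
        num_block = indices.size(1)
        if num_block % 4 != 0:  # mm_to_sparse/reduce_sum/scatter launch num_block / 4 blocks
            raise ValueError("num_block must be a multiple of 4")
        if dim % 64 != 0:  # sparse_dense_mm steps dim_stride by 64
            raise ValueError("dim must be a multiple of 64")
        if indices.dtype != torch.int32:
            raise ValueError("indices must be int32 (the kernels read int*)")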
diff --git a/mra/cuda_launch.h b/mra/cuda_launch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a8950a657c50ff70351eb43e4862f30e49f36e4
--- /dev/null
+++ b/mra/cuda_launch.h
@@ -0,0 +1,39 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <vector>
+
+#define min(a, b) ((a)<(b)?(a):(b))
+#define max(a, b) ((a)>(b)?(a):(b))
+
+std::vector<at::Tensor> index_max_kernel(
+ at::Tensor index_vals,
+ at::Tensor indices,
+ int A_num_block,
+ int B_num_block
+);
+
+at::Tensor mm_to_sparse_kernel(
+ at::Tensor dense_A,
+ at::Tensor dense_B,
+ at::Tensor indices
+);
+
+at::Tensor sparse_dense_mm_kernel(
+ at::Tensor sparse_A,
+ at::Tensor indices,
+ at::Tensor dense_B,
+ int A_num_block
+);
+
+at::Tensor reduce_sum_kernel(
+ at::Tensor sparse_A,
+ at::Tensor indices,
+ int A_num_block,
+ int B_num_block
+);
+
+at::Tensor scatter_kernel(
+ at::Tensor dense_A,
+ at::Tensor indices,
+ int B_num_block
+);
diff --git a/torch-ext/cuda_launch.h b/torch-ext/cuda_launch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a8950a657c50ff70351eb43e4862f30e49f36e4
--- /dev/null
+++ b/torch-ext/cuda_launch.h
@@ -0,0 +1,39 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <vector>
+
+#define min(a, b) ((a)<(b)?(a):(b))
+#define max(a, b) ((a)>(b)?(a):(b))
+
+std::vector<at::Tensor> index_max_kernel(
+ at::Tensor index_vals,
+ at::Tensor indices,
+ int A_num_block,
+ int B_num_block
+);
+
+at::Tensor mm_to_sparse_kernel(
+ at::Tensor dense_A,
+ at::Tensor dense_B,
+ at::Tensor indices
+);
+
+at::Tensor sparse_dense_mm_kernel(
+ at::Tensor sparse_A,
+ at::Tensor indices,
+ at::Tensor dense_B,
+ int A_num_block
+);
+
+at::Tensor reduce_sum_kernel(
+ at::Tensor sparse_A,
+ at::Tensor indices,
+ int A_num_block,
+ int B_num_block
+);
+
+at::Tensor scatter_kernel(
+ at::Tensor dense_A,
+ at::Tensor indices,
+ int B_num_block
+);
diff --git a/torch-ext/mra/__init__.py b/torch-ext/mra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86daaf874c808b8b828b43b4ee8a6b292323d336
--- /dev/null
+++ b/torch-ext/mra/__init__.py
@@ -0,0 +1,25 @@
+from ._ops import ops
+import torch
+
+def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+ return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+ return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+ return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+ return ops.scatter(dense_A, indices, B_num_block)
+
+__all__ = [
+ "index_max",
+ "mm_to_sparse",
+ "sparse_dense_mm",
+ "reduce_sum",
+ "scatter",
+]
\ No newline at end of file
diff --git a/torch-ext/torch_binding.cpp b/torch-ext/torch_binding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bc885ced71edcbe2f3bbc900f4d56b11ee423841
--- /dev/null
+++ b/torch-ext/torch_binding.cpp
@@ -0,0 +1,92 @@
+#include <torch/library.h>
+#include <ATen/ATen.h>
+#include <vector>
+#include <cstdint>
+
+#include "registration.h"
+#include "cuda_launch.h"
+
+std::vector<at::Tensor> index_max(
+ at::Tensor index_vals,
+ at::Tensor indices,
+ int64_t A_num_block,
+ int64_t B_num_block
+) {
+ return index_max_kernel(
+ index_vals,
+ indices,
+ static_cast<int>(A_num_block),
+ static_cast<int>(B_num_block)
+ );
+}
+
+at::Tensor mm_to_sparse(
+ at::Tensor dense_A,
+ at::Tensor dense_B,
+ at::Tensor indices
+) {
+ return mm_to_sparse_kernel(
+ dense_A,
+ dense_B,
+ indices
+ );
+}
+
+at::Tensor sparse_dense_mm(
+ at::Tensor sparse_A,
+ at::Tensor indices,
+ at::Tensor dense_B,
+ int64_t A_num_block
+) {
+ return sparse_dense_mm_kernel(
+ sparse_A,
+ indices,
+ dense_B,
+ static_cast<int>(A_num_block)
+ );
+}
+
+at::Tensor reduce_sum(
+ at::Tensor sparse_A,
+ at::Tensor indices,
+ int64_t A_num_block,
+ int64_t B_num_block
+) {
+ return reduce_sum_kernel(
+ sparse_A,
+ indices,
+ static_cast<int>(A_num_block),
+ static_cast<int>(B_num_block)
+ );
+}
+
+at::Tensor scatter(
+ at::Tensor dense_A,
+ at::Tensor indices,
+ int64_t B_num_block
+) {
+ return scatter_kernel(
+ dense_A,
+ indices,
+ static_cast<int>(B_num_block)
+ );
+}
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+ ops.def("index_max(Tensor index_vals, Tensor indices, int A_num_block, int B_num_block) -> Tensor[]");
+ ops.impl("index_max", torch::kCUDA, &index_max);
+
+ ops.def("mm_to_sparse(Tensor dense_A, Tensor dense_B, Tensor indices) -> Tensor");
+ ops.impl("mm_to_sparse", torch::kCUDA, &mm_to_sparse);
+
+ ops.def("sparse_dense_mm(Tensor sparse_A, Tensor indices, Tensor dense_B, int A_num_block) -> Tensor");
+ ops.impl("sparse_dense_mm", torch::kCUDA, &sparse_dense_mm);
+
+ ops.def("reduce_sum(Tensor sparse_A, Tensor indices, int A_num_block, int B_num_block) -> Tensor");
+ ops.impl("reduce_sum", torch::kCUDA, &reduce_sum);
+
+ ops.def("scatter(Tensor dense_A, Tensor indices, int B_num_block) -> Tensor");
+ ops.impl("scatter", torch::kCUDA, &scatter);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME);
\ No newline at end of file