Kernels
github-actions[bot] committed
Commit 6ec5093 · 1 parent: 811726c

Add built binary [skip-build]

Files changed (41)
  1. build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch28-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} +2 -2
  3. build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +5 -0
  4. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  5. build/torch28-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} +2 -2
  6. build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +5 -0
  7. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
  8. build/torch28-cxx11-cu129-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} +2 -2
  9. build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +5 -0
  10. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  11. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} +2 -2
  12. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +5 -0
  13. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so +3 -0
  15. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_b0230e7_dirty.abi3.so +0 -3
  16. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +5 -0
  17. build/torch29-cxx11-cu126-x86_64-linux/optimizer/__init__.py +5 -0
  18. build/torch29-cxx11-cu126-x86_64-linux/optimizer/_ops.py +9 -0
  19. build/torch29-cxx11-cu126-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so +3 -0
  20. build/torch29-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  21. build/torch29-cxx11-cu126-x86_64-linux/optimizer/muon.py +1069 -0
  22. build/torch29-cxx11-cu128-x86_64-linux/optimizer/__init__.py +5 -0
  23. build/torch29-cxx11-cu128-x86_64-linux/optimizer/_ops.py +9 -0
  24. build/torch29-cxx11-cu128-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so +3 -0
  25. build/torch29-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  26. build/torch29-cxx11-cu128-x86_64-linux/optimizer/muon.py +1069 -0
  27. build/torch29-cxx11-cu130-x86_64-linux/optimizer/__init__.py +5 -0
  28. build/torch29-cxx11-cu130-x86_64-linux/optimizer/_ops.py +9 -0
  29. build/torch29-cxx11-cu130-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so +3 -0
  30. build/torch29-cxx11-cu130-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  31. build/torch29-cxx11-cu130-x86_64-linux/optimizer/muon.py +1069 -0
  32. build/torch29-cxx11-rocm63-x86_64-linux/optimizer/__init__.py +5 -0
  33. build/torch29-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +9 -0
  34. build/torch29-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so +3 -0
  35. build/torch29-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  36. build/torch29-cxx11-rocm63-x86_64-linux/optimizer/muon.py +1069 -0
  37. build/torch29-cxx11-rocm64-x86_64-linux/optimizer/__init__.py +5 -0
  38. build/torch29-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +9 -0
  39. build/torch29-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so +3 -0
  40. build/torch29-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py +128 -0
  41. build/torch29-cxx11-rocm64-x86_64-linux/optimizer/muon.py +1069 -0
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b0230e7_dirty
-ops = torch.ops._optimizer_b0230e7_dirty
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b0230e7_dirty::{op_name}"
+    return f"_optimizer_811726c_dirty::{op_name}"
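The only change here is the build-hash suffix of the native extension's namespace. As a tiny illustration (my own, not from the repository; `"foo"` is a placeholder op name), the helper simply qualifies op names with that namespace so they can be looked up under `torch.ops`:

```python
def add_op_namespace_prefix(op_name: str) -> str:
    # Same helper as above: qualify an op name with the rebuilt extension's namespace.
    return f"_optimizer_811726c_dirty::{op_name}"

assert add_op_namespace_prefix("foo") == "_optimizer_811726c_dirty::foo"  # "foo" is a placeholder
```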
build/torch28-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69525fcbfbe640264f4d52c9843b395b17f1828d38e1eceb97cec6bf46b0d8d0
-size 1824256
+oid sha256:511199ac2ae46febc8aeeb96e843a748da7d6fdea4922572ccf27ee5eabe312d
+size 1816064
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -606,6 +606,11 @@ class Muon(torch.optim.Optimizer):
 
         if p.placements == (Shard(dim=0), ):
             # Case for FSDP
+            process_group = p.device_mesh.get_group(mesh_dim=0)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
             return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
         elif p.placements == (Replicate(), Shard(dim=0)):
             # Case for HSDP
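The same five-line guard is added to every build variant below: the optimizer caches the caller's rank within the shard process group the first time it sees a `Shard(dim=0)` parameter and asserts that every later parameter resolves to the same rank. A standalone restatement of that logic (my own sketch, assuming an initialized torch.distributed process group and a DTensor parameter `p`):

```python
import torch.distributed as dist

def resolve_shard_rank(p, cached_rank=None):
    # FSDP case: the parameter is Shard(dim=0) on a 1-D device mesh.
    process_group = p.device_mesh.get_group(mesh_dim=0)
    rank = dist.get_rank(group=process_group)
    if cached_rank is None:
        return rank              # first DTensor parameter seen: cache this rank
    assert cached_rank == rank   # later parameters must resolve to the same rank
    return cached_rank
```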
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b0230e7_dirty
-ops = torch.ops._optimizer_b0230e7_dirty
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b0230e7_dirty::{op_name}"
+    return f"_optimizer_811726c_dirty::{op_name}"
build/torch28-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:331cc0bc5ee469afdfe0fc590bf52910c118cd0cec62ccbf85778c12ae367a95
-size 1883344
+oid sha256:b3cdb515b6c56204224cc307b66d34fcee1cd5e27b4117197a71b784d34fadc5
+size 1871056
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -606,6 +606,11 @@ class Muon(torch.optim.Optimizer):
 
         if p.placements == (Shard(dim=0), ):
             # Case for FSDP
+            process_group = p.device_mesh.get_group(mesh_dim=0)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
             return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
         elif p.placements == (Replicate(), Shard(dim=0)):
             # Case for HSDP
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b0230e7_dirty
-ops = torch.ops._optimizer_b0230e7_dirty
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b0230e7_dirty::{op_name}"
+    return f"_optimizer_811726c_dirty::{op_name}"
build/torch28-cxx11-cu129-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f6ba7ad9228edcce4bf49173562b0796f1657eb734ddd6e23ca773c153eefce2
-size 1883344
+oid sha256:b957f60eab442d3ff5a5525d16a1b4b71e8c6be32edb874d9a5681953c61f0c2
+size 1871056
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED
@@ -606,6 +606,11 @@ class Muon(torch.optim.Optimizer):
 
         if p.placements == (Shard(dim=0), ):
             # Case for FSDP
+            process_group = p.device_mesh.get_group(mesh_dim=0)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
             return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
         elif p.placements == (Replicate(), Shard(dim=0)):
             # Case for HSDP
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b0230e7_dirty
-ops = torch.ops._optimizer_b0230e7_dirty
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b0230e7_dirty::{op_name}"
+    return f"_optimizer_811726c_dirty::{op_name}"
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_b0230e7_dirty.abi3.so → _optimizer_811726c_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:649c9c1ca7360650167cc191e373b271a4138161ec40b1e881a87515f82a613f
-size 1750000
+oid sha256:898ff08457f77c2f6ef504c73570cc87c5c5fd9a144528dbf8af4c03ffc21049
+size 1749232
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -606,6 +606,11 @@ class Muon(torch.optim.Optimizer):
 
         if p.placements == (Shard(dim=0), ):
             # Case for FSDP
+            process_group = p.device_mesh.get_group(mesh_dim=0)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
             return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
         elif p.placements == (Replicate(), Shard(dim=0)):
             # Case for HSDP
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b0230e7_dirty
-ops = torch.ops._optimizer_b0230e7_dirty
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b0230e7_dirty::{op_name}"
+    return f"_optimizer_811726c_dirty::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72d100180fd73094f7b1c6e765eb4a77f103ad392fdee571687cb0c66d304177
+size 1749320
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_b0230e7_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:42b60753dab0948f4009893fcf3a8b080ad00e0436cbdaf0995dc29ae066c0c7
-size 1750088
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED
@@ -606,6 +606,11 @@ class Muon(torch.optim.Optimizer):
 
         if p.placements == (Shard(dim=0), ):
             # Case for FSDP
+            process_group = p.device_mesh.get_group(mesh_dim=0)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
             return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
         elif p.placements == (Replicate(), Shard(dim=0)):
             # Case for HSDP
build/torch29-cxx11-cu126-x86_64-linux/optimizer/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .muon import Muon
+
+__all__ = [
+    "Muon",
+]
build/torch29-cxx11-cu126-x86_64-linux/optimizer/_ops.py ADDED
@@ -0,0 +1,9 @@
+import torch
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_optimizer_811726c_dirty::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87c8e75ead1c831dabfce1abbd7c100aa72c9b2988dfc0e1554216ca8005267c
+size 1816064
build/torch29-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
+# MIT License
+#
+# Copyright (c) 2025 Tianyang Lin
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch
+import triton
+import triton.language as tl
+
+
+def get_autotune_config():
+    return [
+        triton.Config(
+            {
+                'BLOCK_SIZE_M': blk_m,
+                'BLOCK_SIZE_K': blk_k,
+                'GROUP_SIZE_M': grp_sz
+            },
+            num_stages=n_stages,
+            num_warps=n_warps) for blk_m in [32, 64, 128]
+        for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
+        for n_warps in [4, 8]
+    ]
+
+
+@triton.autotune(
+    configs=get_autotune_config(),
+    key=['M', 'K'],
+)
+@triton.jit
+def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
+               BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+               GROUP_SIZE_M: tl.constexpr):
+    """
+    Core kernel jit function of matmul_transpose that computes y = x @ x.T
+    The code is a simple adaptation from the triton `matmul` tutorial:
+    https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
+    """
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+    if pid_m > pid_n:
+        return
+
+    offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    # we use a & b ptrs to denote different rows of x.
+    a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+    b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
+
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs,
+                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
+                    other=0.0)
+        b = tl.load(b_ptrs,
+                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
+                    other=0.0)
+        accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
+        a_ptrs += BLOCK_SIZE_K * stride_xk
+        b_ptrs += BLOCK_SIZE_K * stride_xk
+    # use dtype.element_ty to accommodate different input datatypes as in cpp templates
+    # https://github.com/triton-lang/triton/issues/2252
+    c = accumulator.to(x.dtype.element_ty)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+    # transpose and copy
+    if pid_m < pid_n:
+        ct_ptrs = y + stride_ym * offs_cn[:,
+                                          None] + stride_yn * offs_cm[None, :]
+        ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+        tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
+
+
+def matmul_transpose_assign(d_in, d_out):
+    assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
+    assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
+    assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
+    assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
+    assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
+    assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
+    assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
+        "First dimension of `d_in` must match first and second dimension of `d_out`"
+
+    d_in = d_in.contiguous()
+    M, K = d_in.shape
+    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
+        M, META['BLOCK_SIZE_M']), )
+    with torch.cuda.device(d_in.device.index):
+        mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
+                         d_out.stride(0), d_out.stride(1))
+
+
+def matmul_transpose(d_in):
+    M, _ = d_in.shape
+    d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
+    matmul_transpose_assign(d_in, d_out)
+    return d_out
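For context, a small usage sketch of the Triton helper above (my own, not part of this commit). It assumes a CUDA device, an installed Triton, and that the module is importable under the `optimizer` package as built here; `matmul_transpose(x)` is intended to match `x @ x.T`:

```python
import torch
from optimizer.matmul_transpose_triton import matmul_transpose  # assumed import path

x = torch.randn(512, 1024, device="cuda", dtype=torch.bfloat16)
y = matmul_transpose(x)  # (512, 512); the kernel computes the upper triangle
                         # and mirrors it into the lower triangle
ref = x @ x.T
print((y.float() - ref.float()).abs().max())  # expect a small bf16-level difference
```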
build/torch29-cxx11-cu126-x86_64-linux/optimizer/muon.py ADDED
@@ -0,0 +1,1069 @@
+import logging
+import math
+import types
+from dataclasses import dataclass
+from typing import List, Optional, Union, cast
+
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
+
+from .matmul_transpose_triton import matmul_transpose_assign
+
+logger = logging.getLogger(__name__)
+
+COMM_DTYPE = torch.bfloat16
+
+
+# This code snippet is a modified version adapted from the following GitHub repositories:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+# Muon's Newton–Schulz iteration causes high variance in singular values
+# Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
+@torch.no_grad()
+# matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+    zero even beyond the point where the iteration no longer converges all the way to one everywhere
+    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    """
+    assert len(G.shape) == 2
+    assert G.dtype == COMM_DTYPE
+    X = G  # no manual typecast
+
+    if G.size(0) > G.size(1):
+        X = X.T
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm() + 1e-7)
+    buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+    buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+    # Perform the NS iterations
+    for a, b, c in [
+        (4.0848, -6.8946, 2.9270),
+        (3.9505, -6.3029, 2.6377),
+        (3.7418, -5.5913, 2.3037),
+        (2.8769, -3.1427, 1.2046),
+        (2.8366, -3.0525, 1.2012),
+    ]:
+        matmul_transpose_assign(X, buf1)
+        matmul_transpose_assign(buf1, buf2)
+        buf1.mul_(b).add_(buf2, alpha=c)
+        X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
+
+    if G.size(0) > G.size(1):
+        X = X.T
+    return X
+
+
+@dataclass
+class _muon_state:
+    # TODO: use Optional
+    worker_rank: int | None = None
+    gathered_grad: torch.Tensor | None = None
+    scattered_u: DTensor | None = None
+    computed_u: torch.Tensor | None = None
+    gather_event: torch.cuda.Event | None = None
+    compute_event: torch.cuda.Event | None = None
+    scatter_event: torch.cuda.Event | None = None
+    process_group = None
+    qk_clip_state = None
+
+
+def split_elems_for_src(param, src_rank, num_ranks) -> int:
+    rows = param.shape[0]
+    cols = int(param.numel() // rows)
+    base, rem = divmod(rows, num_ranks)
+    my_rows = base + (1 if src_rank < rem else 0)
+    return my_rows * cols
+
+
+@torch.no_grad()
+def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
+    """
+    Pre-allocate gathered_grad buffer on compute_stream
+    before launching all2all gather
+    """
+    with torch.cuda.stream(compute_stream):
+        for p in params:
+            state = param_to_state[id(p)]
+            if rank == state.worker_rank:
+                num_ranks = dist.get_world_size(group=state.process_group)
+                state.gathered_grad = torch.empty(p.grad.numel(),
+                                                  dtype=COMM_DTYPE,
+                                                  device="cuda")
+            else:
+                state.gathered_grad = None
+
+    alloc_event = torch.cuda.Event()
+    alloc_event.record(compute_stream)
+    return alloc_event
+
+
+@torch.no_grad()
+def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
+                    alloc_event):
+    """
+    All2all gathers shards so each owner rank reconstructs its full gradient
+    """
+    with torch.cuda.stream(comm_stream):
+        process_group = param_to_state[id(params[0])].process_group
+        num_ranks = dist.get_world_size(group=process_group)
+
+        # Construct sending buffers
+        per_dst = [[] for _ in range(num_ranks)]
+        send_counts = [0] * num_ranks
+
+        for p in params:
+            state = param_to_state[id(p)]
+            dst = state.worker_rank
+            assert dst < num_ranks
+            shard_elems = split_elems_for_src(p, rank, num_ranks)
+            g = p.grad
+            g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
+            assert g.numel() == shard_elems
+            per_dst[dst].append(g)
+            send_counts[dst] += shard_elems
+
+        assert any(
+            len(v) > 0 for v in per_dst
+        ), "At least one destination rank must receive a sharded tensor"
+        # list[list[Tensor]] -> list[Tensor]
+        per_dst = [t for dst in per_dst for t in dst]
+
+        send_buf = torch.cat(per_dst, dim=0)
+
+        owned_params = [
+            p for p in params if param_to_state[id(p)].worker_rank == rank
+        ]
+
+        # Compute receive sizes and allocate receiving buffers
+        recv_counts = [0] * num_ranks
+
+        for src in range(num_ranks):
+            total = 0
+            for p in owned_params:
+                state = param_to_state[id(p)]
+                assert state.worker_rank == rank
+                total += split_elems_for_src(p, src, num_ranks)
+            recv_counts[src] = total
+
+        recv_total = sum(recv_counts)
+        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
+
+        #All2All
+        dist.all_to_all_single(
+            recv_buf,
+            send_buf,
+            output_split_sizes=recv_counts,
+            input_split_sizes=send_counts,
+            group=process_group,
+        )
+
+        # Reconstructs gathered grad from the received buffer
+        #
+        # recv_buf (num ranks = 3)
+        #
+        #    From rank 0         From rank 1         From rank 2
+        # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
+        #
+        # Outer loop:
+        #   rank 0 -> rank 1 -> rank2
+        #
+        # Inner loop:
+        #   p1_n -> p2_n -> p3_n
+
+        comm_stream.wait_event(alloc_event)
+
+        off = 0
+        write_offsets = {id(p): 0 for p in owned_params}
+        for src in range(num_ranks):
+            if recv_counts[src] == 0:
+                continue
+
+            block = recv_counts[src]
+            inner_off = 0
+            for p in owned_params:
+                state = param_to_state[id(p)]
+                assert state.worker_rank == rank
+                n = split_elems_for_src(p, src, num_ranks)
+                assert n > 0
+
+                sg = recv_buf.narrow(0, off + inner_off, n)
+                woff = write_offsets[id(p)]
+                dst = state.gathered_grad.narrow(0, woff, n)
+                dst.copy_(sg)
+
+                write_offsets[id(p)] += n
+                inner_off += n
+            off += block
+
+        for p in params:
+            state = param_to_state[id(p)]
+            if state.worker_rank == rank:
+                state.gathered_grad = state.gathered_grad.view_as(p)
+                state.gather_event = torch.cuda.Event()
+                state.gather_event.record(comm_stream)
+            else:
+                state.gathered_grad = None
+                state.gather_event = None
+            if none_grad:
+                p.grad = None
+
+
+@torch.no_grad()
+def _compute_u(p, state, steps, rank, compute_stream):
+    """
+    On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank == state.worker_rank:
+            if state.gather_event is None:
+                raise RuntimeError("Gather event must be set before compute.")
+            compute_stream.wait_event(state.gather_event)
+            u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+            state.gathered_grad = None
+            state.computed_u = u
+            state.compute_event = torch.cuda.Event()
+            state.compute_event.record()
+        else:
+            state.computed_u = None
+            state.compute_event = None
+
+
+@torch.no_grad()
+def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
+    """
+    Pre-allocate scattered_u buffer on compute_stream
+    before launching all2all gather
+    """
+    with torch.cuda.stream(compute_stream):
+        for p in params:
+            state = param_to_state[id(p)]
+            state.scattered_u = torch.empty_like(p.to_local(),
+                                                 dtype=COMM_DTYPE)
+
+    alloc_event = torch.cuda.Event()
+    alloc_event.record(compute_stream)
+    return alloc_event
+
+
+def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
+    """
+    All2all scatters full gradients to all ranks
+    """
+    with torch.cuda.stream(comm_stream):
+        process_group = param_to_state[id(params[0])].process_group
+        num_ranks = dist.get_world_size(group=process_group)
+        owned_params = [
+            p for p in params if param_to_state[id(p)].worker_rank == rank
+        ]
+
+        # Construct sending buffer
+        per_dst = [[] for _ in range(num_ranks)]
+        send_counts = [0] * num_ranks
+
+        if owned_params:
+            for p in owned_params:
+                state = param_to_state[id(p)]
+                if state.compute_event is None:
+                    raise RuntimeError(
+                        "Compute event must be set before scatter.")
+                comm_stream.wait_event(state.compute_event)
+                state.gathered_grad = None
+
+                assert state.computed_u is not None
+
+                u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
+
+                offset = 0
+                for dst in range(num_ranks):
+                    n = split_elems_for_src(p, dst, num_ranks)
+                    assert n > 0
+
+                    su = u_full.narrow(0, offset, n)
+                    per_dst[dst].append(su)
+                    send_counts[dst] += n
+                    offset += n
+
+                assert offset == u_full.numel()
+
+        lengths = [len(v) for v in per_dst]
+        if all(l > 0 for l in lengths):
+            assert all(
+                l == lengths[0] for l in lengths
+            ), "All destination ranks must have the same number of sharded tensor"
+            # list[list[Tensor]] -> list[Tensor]
+            per_dst = [t for dst in per_dst for t in dst]
+            send_buf = torch.cat(per_dst, dim=0)
+        else:
+            # all_to_all requires participation from all ranks
+            # Even non-owner ranks must join the collective call
+            send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
+
+        # Compute receive sizes and allocate receiving buffers
+        recv_counts = [0] * num_ranks
+
+        for src in range(num_ranks):
+            total = 0
+            for p in params:
+                state = param_to_state[id(p)]
+                if state.worker_rank != src:
+                    continue
+                total += split_elems_for_src(p, rank, num_ranks)
+            recv_counts[src] = total
+
+        recv_total = sum(recv_counts)
+        assert recv_total > 0
+        recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
+
+        #All2All
+        dist.all_to_all_single(
+            recv_buf,
+            send_buf,
+            output_split_sizes=recv_counts,
+            input_split_sizes=send_counts,
+            group=process_group,
+        )
+
+        # Copy to pre-allocated scattered_u buffer from the received buffer
+        #
+        # recv_buf (num ranks = 3, local_rank = 0)
+        #
+        #    From rank 0     From rank 1   From rank 2
+        # | p1_0, p2_0, p3_0 |    p4_0    | p5_0, p6_0 |
+        #
+        # Outer loop:
+        #   rank 0 -> rank 1 -> rank2
+        #
+        # Inner loop:
+        #   src(0) : p1_0 -> p2_0 -> p3_0
+        #   src(1) : p4_0
+        #   src(2) : p5_0 -> p6_0
+
+        comm_stream.wait_event(alloc_event)
+
+        off = 0
+        for src in range(num_ranks):
+            block = recv_counts[src]
+            if block == 0:
+                continue
+
+            inner_off = 0
+            for p in params:
+                state = param_to_state[id(p)]
+                if state.worker_rank != src:
+                    continue
+                n = split_elems_for_src(p, rank, num_ranks)
+                assert n > 0
+
+                flat_local = recv_buf.narrow(0, off + inner_off,
+                                             n).view_as(p.to_local())
+                state.scattered_u.copy_(flat_local)
+
+                state.scatter_event = torch.cuda.Event()
+                state.scatter_event.record(comm_stream)
+                inner_off += n
+
+            assert inner_off == block
+            off += block
+
+
+def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
+                  compute_stream):
+    """
+    Update sharded parameter p with the scattered_u.
+    Only worker_rank frees computed_u.
+    """
+    with torch.cuda.stream(compute_stream):
+        if state.scatter_event is None:
+            raise RuntimeError("Scatter event must be set before update")
+        compute_stream.wait_event(state.scatter_event)
+        u_dtensor = DTensor.from_local(
+            state.scattered_u,
+            placements=p.placements,
+            device_mesh=p.device_mesh,
+        )
+
+        state.scattered_u = u_dtensor
+
+        if rank == state.worker_rank:
+            # Free computed_u
+            state.computed_u = None
+
+        Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
+        state.scattered_u = None
+        u_dtensor = None
+
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
+
+
+def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
+
+
+def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
+
+    return [
+        {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
+        },
+        {
+            "params": non_muon_params,
+            "use_muon": False,
+        },
+    ]
+
+
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+
+    Example:
+        'model.3.attn.wq.weight' -> ('wq', 3)
+        'model.5.attn.wk.weight' -> ('wk', 5)
+        'model.2.attn.q_proj.weight' -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight' -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight' -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+
+    kind = parts[-2]
+
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+
+    return None, -1
+
+
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: Optional[str]  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: List[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: Optional[torch.Tensor]
+
+
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+
+    Arguments:
+        model: The model to be optimized by Muon.
+        is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
+        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
+        weight_decay: The weight decay for Muon and AdamW.
+        {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
+        adamw_lr: The learning rate for the internal AdamW.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
+        debug: Whether to print debug information.
+        clip_info : Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+              this value will be scaled down.
+            Default is:
+            {
+                "q_indices": [],
+                "k_indices": [],
+                "head_dim": 128,
+                "threshold": 100
+            }
+        overlap_step : How many all2all gather, compute operations are launched in advance
+            before the corresponding all2all scatter steps begin.
+            A higher overlap_step increases memory usage but can improve
+            performance by overlapping communication.
+            Parallel muon only.
+    """
+
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 },
+                 overlap_step=5):
+        defaults = dict(
+            lr=lr,
+            weight_decay=weight_decay,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+            none_grad=none_grad,
+            use_muon=True,
+        )
+        error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
+        instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
+
+        if isinstance(params, types.GeneratorType):
+            raise ValueError(error_message.format(idx=0) + instruction_code)
+        for _idx, param_group in enumerate(params):
+            if param_group.get("use_muon", None) is None:
+                raise ValueError(
+                    error_message.format(idx=_idx) + instruction_code)
+
+        super().__init__(params, defaults)
+
+        self.rank = None
+
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        self.clip_config = clip_config
+        self.overlap_step = overlap_step
+
+    def _calc_flops(self, G, steps):
+        assert len(G.shape) == 2
+        M, N = G.shape
+        if M > N:
+            M, N = N, M
+
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+
+    def adjust_lr_for_muon(self, lr, param_shape):
+        A, B = param_shape[:2]
+        # We adjust the learning rate and weight decay based on the size of the parameter matrix
+        # as describted in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+
+    def get_shard_mesh(self, p):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(
+            p, DTensor), "Parallel Muon only supports DTensor parameters."
+
+        if p.placements == (Shard(dim=0), ):
+            # Case for FSDP
+            process_group = p.device_mesh.get_group(mesh_dim=0)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            process_group = p.device_mesh.get_group(mesh_dim=1)
+            if self.rank is None:
+                self.rank = dist.get_rank(group=process_group)
+            else:
+                assert self.rank == dist.get_rank(group=process_group)
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if self.rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
+
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
+        param_to_state = {}
+        param_to_flops = {}
+
+        total_flops = 0
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
+                  flush=True)
+
+        paired = list(zip(names, params))
+
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
+
+        round_robin = 0
+        mesh = None
+        shard_mesh = None
+        process_group = None
+        for n, p in zip(ordered_names, ordered_params):
+            if mesh is None:
+                mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p)
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            num_ranks = dist.get_world_size(group=process_group)
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(
+                p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
+            param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
+            round_robin = (round_robin + 1) % len(shard_mesh)
+
+        return param_to_state, ordered_params
+
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
+        # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            assert g is not None
+
+            # calc update
+            state = self.state[p]
+            if "momentum_buffer" not in state:
+                state["momentum_buffer"] = torch.zeros_like(g)
+            buf = state["momentum_buffer"]
+            buf.mul_(momentum).add_(g)
+            if group["nesterov"]:
+                g = g.add(buf, alpha=momentum)
+            else:
+                g = buf
+
+            u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
+                                             steps=group["ns_steps"])
+
+            adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+            Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
+
+    def _update_g(self, p, g, group, momentum):
+        # calc update
+        state = self.state[p]
+        buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
+        torch.add(g, buf, alpha=momentum, out=buf)
+        if group["nesterov"]:
+            g.add_(buf, alpha=momentum)
+            return g
+        return buf
+
+    @staticmethod
+    def _update_p(p, u, lr, adjusted_lr, weight_decay):
+        # apply weight decay
+        p.data.mul_(1 - lr * weight_decay)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+
+        return scales_full if scaling > 0 else None
+
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
+        """
+        Perform a parallel optimization step using Muon.
+        """
+
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+
+            # Update g in the local rank
+            g = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+            p.grad = g
+
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
+
+        assert self.rank is not None
+
+        def enqueue_all2all_gather(start_idx, chunk_size):
+            target_params = ordered_params[start_idx:start_idx + chunk_size]
+            if target_params:
+                alloc_event = _alloc_gathered_grad(target_params,
+                                                   param_to_state, self.rank,
+                                                   self.compute_stream)
+                _all2all_gather(target_params, param_to_state, self.rank,
+                                self.comm_stream, group["none_grad"],
+                                alloc_event)
+
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(p, state, group["ns_steps"], self.rank,
+                           self.compute_stream)
+
+        def enqueue_all2all_scatter(start_idx, chunk_size):
+            target_params = ordered_params[start_idx:start_idx + chunk_size]
+            if target_params:
+                alloc_event = _alloc_scattered_u(target_params, param_to_state,
+                                                 self.rank,
+                                                 self.compute_stream)
+                _all2all_scatter(target_params, param_to_state, self.rank,
+                                 self.comm_stream, alloc_event)
+
+        def enqueue_update_param(start_idx, chunk_size):
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _update_param(p, state, lr, adjusted_lr, weight_decay,
+                              self.rank, self.compute_stream)
+
+        chunk_size = dist.get_world_size(param_to_state[id(
+            params[0])].process_group)
+
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+
+        overlap_step = self.overlap_step
+        for i in range(0, overlap_step):
+            enqueue_all2all_gather(i * chunk_size, chunk_size)
+            enqueue_computes(i * chunk_size, chunk_size)
+
+        for i in range(0, len(params) + chunk_size - 1, chunk_size):
+            enqueue_all2all_scatter(i, chunk_size)
+            enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
+            enqueue_update_param(i, chunk_size)
+            enqueue_computes(i + overlap_step * chunk_size, chunk_size)
+
+        # Wait the last update_param to finish
+        torch.cuda.current_stream().wait_stream(self.compute_stream)
+
+    @staticmethod
+    def _fused_adamw(
+        params: list[torch.Tensor],
+        grads: list[torch.Tensor],
+        exp_avgs: list[torch.Tensor],
+        exp_avg_sqs: list[torch.Tensor],
+        max_exp_avg_sqs: list[torch.Tensor],
+        state_steps: list[torch.Tensor],
+        amsgrad: bool,
+        beta1: float,
+        beta2: float,
+        lr: Union[float, torch.Tensor],
+        weight_decay: float,
+        eps: float,
+        maximize: bool,
+    ) -> None:
+        if not params:
+            return
+
+        # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
+        # treating it as a scalar.
+        lr_dict: Optional[DeviceDict] = ({
+            lr.device: lr
+        } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
+                                         None)
+        grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
+            [
+                params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
+                state_steps
+            ]  # type: ignore[list-item]
+        )
+        for (device, _), (
+            (
+                device_params_,
+                device_grads_,
+                device_exp_avgs_,
+                device_exp_avg_sqs_,
+                device_max_exp_avg_sqs,
+                device_state_steps_,
+            ),
+            _,
+        ) in grouped_tensors.items():
+            device_params = cast(list[torch.Tensor], device_params_)
+            device_grads = cast(list[torch.Tensor], device_grads_)
+            device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
+            device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
+            device_state_steps = cast(list[torch.Tensor], device_state_steps_)
+
+            if lr_dict is not None and device not in lr_dict:
+                lr_dict[device] = lr.to(
+                    device=device,
+                    non_blocking=True)  # type: ignore[union-attr]
+                lr = lr_dict[device]
+            torch._foreach_add_(device_state_steps, 1)
+            func = torch._fused_adamw_
+            func(
+                device_params,
+                device_grads,
+                device_exp_avgs,
+                device_exp_avg_sqs,
+                device_max_exp_avg_sqs,  # type: ignore[arg-type]
+                device_state_steps,
+                amsgrad=amsgrad,
+                lr=lr,  # type: ignore[arg-type]
+                beta1=beta1,
+                beta2=beta2,
+                weight_decay=weight_decay,
+                eps=eps,
+                maximize=maximize,
+            )
+
+    def step(self, closure=None, qk_logits=None):
+        """Perform a single optimization step.
+
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            params = group["params"]
+
+            if group["use_muon"]:
+                ############################
+                #           Muon           #
+                ############################
+                lr = group["lr"]
+                weight_decay = group["weight_decay"]
+                momentum = group["momentum"]
+                names = group["names"]
+
+                param_dtensors = []
+                param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+
+                for n, p in zip(names, params):
+                    if p is None or p.grad is None:
+                        continue
+                    if isinstance(p.data, DTensor):
+                        if all(
+                                isinstance(placement, Replicate)
+                                for placement in p.placements):
+                            param_tensors.append(p)
+                            name_tensors.append(n)
+                        else:
+                            param_dtensors.append(p)
+                            name_dtensors.append(n)
+                    elif isinstance(p.data, torch.Tensor):
+                        param_tensors.append(p)
+                        name_tensors.append(n)
+                    else:
+                        raise TypeError(
+                            f"Unsupported parameter type: {type(p.data)}")
+
+                if self.debug:
+                    print(
+                        f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
+                        flush=True,
+                    )
+
+                if len(param_dtensors) > 0:
+                    if not dist.is_initialized():
+                        raise RuntimeError(
+                            "Parallel Muon requires torch.distributed to be initialized."
+                        )
+
+                    self.parallel(
+                        name_dtensors,
+                        param_dtensors,
+                        group,
+                        lr=lr,
+                        weight_decay=weight_decay,
+                        momentum=momentum,
+                        qk_logits=qk_logits,
+                    )
+
+                if len(param_tensors) > 0:
+                    self.base(
+                        name_tensors,
+                        param_tensors,
+                        group,
+                        lr=lr,
+                        weight_decay=weight_decay,
+                        momentum=momentum,
+                        qk_logits=qk_logits,
+                    )
+
+            else:
+                ############################
+                #       AdamW backup       #
+                ############################
+
+                params_with_grads = []
+                grads = []
+                moment1 = []
+                moment2 = []
+                max_exp_avg_sqs = []
+                state_steps = []
+                lr = group["lr"]
+                beta1, beta2 = group["adamw_betas"]
+                eps = group["adamw_eps"]
+                weight_decay = group["weight_decay"]
+
+                for p in params:
+                    g = p.grad
+                    if g is None:
+                        continue
+                    state = self.state[p]
+                    params_with_grads.append(p)
+                    grads.append(g)
+                    if "step" not in state:
+                        state["step"] = (torch.zeros((),
+                                                     dtype=torch.float32,
+                                                     device=p.device))
+                        state["moment1"] = torch.zeros_like(g)
+                        state["moment2"] = torch.zeros_like(g)
+                    moment1.append(state["moment1"])
+                    moment2.append(state["moment2"])
+                    if not isinstance(state["step"], torch.Tensor):
+                        step_tensor = torch.tensor(state["step"],
+                                                   dtype=torch.float32,
+                                                   device=p.device)
+                    else:
+                        step_tensor = state["step"]
+                    state_steps.append(step_tensor)
+
+                self._fused_adamw(
+                    params_with_grads,
+                    grads,
+                    moment1,
+                    moment2,
+                    max_exp_avg_sqs,
+                    state_steps,
+                    amsgrad=False,
+                    beta1=beta1,
+                    beta2=beta2,
+                    lr=lr,
+                    weight_decay=weight_decay,
+                    eps=eps,
+                    maximize=False,
+                )
+
+        return loss
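A minimal usage sketch of the new optimizer, modeled on the snippet embedded in the `use_muon` error message above (my own wiring; the `kernels.get_kernel` client, the model, and the loss/batch helpers are assumptions, not part of this commit):

```python
import torch
from kernels import get_kernel  # Hugging Face "kernels" client

optimizer_mod = get_kernel("motif-technologies/optimizer")

model = ...  # any torch.nn.Module; DTensor/FSDP-sharded params take the parallel path
params = optimizer_mod.muon.get_default_muon_param_groups(model)
optim = optimizer_mod.Muon(params, lr=0.02, momentum=0.95, weight_decay=0.1)

loss = compute_loss(model, batch)  # hypothetical training-step helper
loss.backward()
optim.step()   # optionally optim.step(qk_logits={layer_idx: per_head_max_logits})
optim.zero_grad()
```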
build/torch29-cxx11-cu128-x86_64-linux/optimizer/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .muon import Muon
+
+__all__ = [
+    "Muon",
+]
build/torch29-cxx11-cu128-x86_64-linux/optimizer/_ops.py ADDED
@@ -0,0 +1,9 @@
+import torch
+from . import _optimizer_811726c_dirty
+ops = torch.ops._optimizer_811726c_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_optimizer_811726c_dirty::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab1875be65811d88c407f36077aced58056a4feeb9946d7cd40ec55c7e1025c8
+size 1871056
build/torch29-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch29-cxx11-cu128-x86_64-linux/optimizer/muon.py ADDED
@@ -0,0 +1,1069 @@
1
+ import logging
2
+ import math
3
+ import types
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional, Union, cast
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.distributed._tensor import DTensor, Replicate, Shard
10
+
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
+
18
+ # This code snippet is a modified version adapted from the following GitHub repositories:
19
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
+ # Muon's Newton–Schulz iteration causes high variance in singular values
21
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
+ @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
+ def _zeropower_via_newtonschulz5(G, steps):
25
+ """
26
+ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
27
+ quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
28
+ of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
29
+ zero even beyond the point where the iteration no longer converges all the way to one everywhere
30
+ on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
31
+ where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
32
+ performance at all relative to UV^T, where USV^T = G is the SVD.
33
+ """
34
+ assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
+ X = G # no manual typecast
37
+
38
+ if G.size(0) > G.size(1):
39
+ X = X.T
40
+ # Ensure spectral norm is at most 1
41
+ X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
+ # Perform the NS iterations
45
+ for a, b, c in [
46
+ (4.0848, -6.8946, 2.9270),
47
+ (3.9505, -6.3029, 2.6377),
48
+ (3.7418, -5.5913, 2.3037),
49
+ (2.8769, -3.1427, 1.2046),
50
+ (2.8366, -3.0525, 1.2012),
51
+ ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
56
+
57
+ if G.size(0) > G.size(1):
58
+ X = X.T
59
+ return X
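+
+ # A minimal sanity check for the iteration above (illustrative, assuming a CUDA
+ # device with Triton; run it separately rather than at import time): the
+ # singular values of the result should sit in a band around 1, e.g.
+ #   G = torch.randn(1024, 256, device="cuda", dtype=torch.bfloat16)
+ #   U = _zeropower_via_newtonschulz5(G, steps=5)
+ #   print(torch.linalg.svdvals(U.float()).min(), torch.linalg.svdvals(U.float()).max())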
60
+
61
+
62
+ @dataclass
63
+ class _muon_state:
64
+ # TODO: use Optional
65
+ worker_rank: int | None = None
66
+ gathered_grad: torch.Tensor | None = None
67
+ scattered_u: DTensor | None = None
68
+ computed_u: torch.Tensor | None = None
69
+ gather_event: torch.cuda.Event | None = None
70
+ compute_event: torch.cuda.Event | None = None
71
+ scatter_event: torch.cuda.Event | None = None
72
+ process_group = None
73
+ qk_clip_state = None
74
+
75
+
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
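+
+ # Worked example (illustrative): a (10, 4) parameter split across 3 ranks gives
+ # base=3, rem=1, so rank 0 owns 4 rows (16 elements) while ranks 1 and 2 own
+ # 3 rows (12 elements) each; remainder rows go to the lowest-numbered ranks.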
82
+
83
+
84
+ @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
+ """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
+ """
112
+ with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
+
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert any(
132
+ len(v) > 0 for v in per_dst
133
+ ), "At least one destination rank must receive a sharded tensor"
134
+ # list[list[Tensor]] -> list[Tensor]
135
+ per_dst = [t for dst in per_dst for t in dst]
136
+
137
+ send_buf = torch.cat(per_dst, dim=0)
138
+
139
+ owned_params = [
140
+ p for p in params if param_to_state[id(p)].worker_rank == rank
141
+ ]
142
+
143
+ # Compute receive sizes and allocate receiving buffers
144
+ recv_counts = [0] * num_ranks
145
+
146
+ for src in range(num_ranks):
147
+ total = 0
148
+ for p in owned_params:
149
+ state = param_to_state[id(p)]
150
+ assert state.worker_rank == rank
151
+ total += split_elems_for_src(p, src, num_ranks)
152
+ recv_counts[src] = total
153
+
154
+ recv_total = sum(recv_counts)
155
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
156
+
157
+ #All2All
158
+ dist.all_to_all_single(
159
+ recv_buf,
160
+ send_buf,
161
+ output_split_sizes=recv_counts,
162
+ input_split_sizes=send_counts,
163
+ group=process_group,
164
+ )
165
+
166
+ # Reconstructs gathered grad from the received buffer
167
+ #
168
+ # recv_buf (num ranks = 3)
169
+ #
170
+ # From rank 0 From rank 1 From rank 2
171
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
172
+ #
173
+ # Outer loop:
174
+ # rank 0 -> rank 1 -> rank 2
175
+ #
176
+ # Inner loop:
177
+ # p1_n -> p2_n -> p3_n
178
+
179
+ comm_stream.wait_event(alloc_event)
180
+
181
+ off = 0
182
+ write_offsets = {id(p): 0 for p in owned_params}
183
+ for src in range(num_ranks):
184
+ if recv_counts[src] == 0:
185
+ continue
186
+
187
+ block = recv_counts[src]
188
+ inner_off = 0
189
+ for p in owned_params:
190
+ state = param_to_state[id(p)]
191
+ assert state.worker_rank == rank
192
+ n = split_elems_for_src(p, src, num_ranks)
193
+ assert n > 0
194
+
195
+ sg = recv_buf.narrow(0, off + inner_off, n)
196
+ woff = write_offsets[id(p)]
197
+ dst = state.gathered_grad.narrow(0, woff, n)
198
+ dst.copy_(sg)
199
+
200
+ write_offsets[id(p)] += n
201
+ inner_off += n
202
+ off += block
203
+
204
+ for p in params:
205
+ state = param_to_state[id(p)]
206
+ if state.worker_rank == rank:
207
+ state.gathered_grad = state.gathered_grad.view_as(p)
208
+ state.gather_event = torch.cuda.Event()
209
+ state.gather_event.record(comm_stream)
210
+ else:
211
+ state.gathered_grad = None
212
+ state.gather_event = None
213
+ if none_grad:
214
+ p.grad = None
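+
+ # Shape bookkeeping sketch (illustrative): with 2 ranks and two parameters of
+ # shape (8, 4) and (6, 4) owned by the same rank, each rank sends
+ # split_elems_for_src(p, rank, 2) elements per parameter (16 and 12), and the
+ # owner's recv_counts[src] is the sum over its owned params, i.e. 28 elements
+ # received from each source rank.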
215
+
216
+
217
+ @torch.no_grad()
218
+ def _compute_u(p, state, steps, rank, compute_stream):
219
+ """
220
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
221
+ """
222
+ with torch.cuda.stream(compute_stream):
223
+ if rank == state.worker_rank:
224
+ if state.gather_event is None:
225
+ raise RuntimeError("Gather event must be set before compute.")
226
+ compute_stream.wait_event(state.gather_event)
227
+ u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
228
+ state.gathered_grad = None
229
+ state.computed_u = u
230
+ state.compute_event = torch.cuda.Event()
231
+ state.compute_event.record()
232
+ else:
233
+ state.computed_u = None
234
+ state.compute_event = None
235
+
236
+
237
+ @torch.no_grad()
238
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
239
+ """
240
+ Pre-allocate scattered_u buffer on compute_stream
241
+ before launching all2all scatter
242
+ """
243
+ with torch.cuda.stream(compute_stream):
244
+ for p in params:
245
+ state = param_to_state[id(p)]
246
+ state.scattered_u = torch.empty_like(p.to_local(),
247
+ dtype=COMM_DTYPE)
248
+
249
+ alloc_event = torch.cuda.Event()
250
+ alloc_event.record(compute_stream)
251
+ return alloc_event
252
+
253
+
254
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
255
+ """
256
+ All2all scatters the computed orthogonalized updates back to all ranks
257
+ """
258
+ with torch.cuda.stream(comm_stream):
259
+ process_group = param_to_state[id(params[0])].process_group
260
+ num_ranks = dist.get_world_size(group=process_group)
261
+ owned_params = [
262
+ p for p in params if param_to_state[id(p)].worker_rank == rank
263
+ ]
264
+
265
+ # Construct sending buffer
266
+ per_dst = [[] for _ in range(num_ranks)]
267
+ send_counts = [0] * num_ranks
268
+
269
+ if owned_params:
270
+ for p in owned_params:
271
+ state = param_to_state[id(p)]
272
+ if state.compute_event is None:
273
+ raise RuntimeError(
274
+ "Compute event must be set before scatter.")
275
+ comm_stream.wait_event(state.compute_event)
276
+ state.gathered_grad = None
277
+
278
+ assert state.computed_u is not None
279
+
280
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
281
+
282
+ offset = 0
283
+ for dst in range(num_ranks):
284
+ n = split_elems_for_src(p, dst, num_ranks)
285
+ assert n > 0
286
+
287
+ su = u_full.narrow(0, offset, n)
288
+ per_dst[dst].append(su)
289
+ send_counts[dst] += n
290
+ offset += n
291
+
292
+ assert offset == u_full.numel()
293
+
294
+ lengths = [len(v) for v in per_dst]
295
+ if all(l > 0 for l in lengths):
296
+ assert all(
297
+ l == lengths[0] for l in lengths
298
+ ), "All destination ranks must have the same number of sharded tensor"
299
+ # list[list[Tensor]] -> list[Tensor]
300
+ per_dst = [t for dst in per_dst for t in dst]
301
+ send_buf = torch.cat(per_dst, dim=0)
302
+ else:
303
+ # all_to_all requires participation from all ranks
304
+ # Even non-owner ranks must join the collective call
305
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
306
+
307
+ # Compute receive sizes and allocate receiving buffers
308
+ recv_counts = [0] * num_ranks
309
+
310
+ for src in range(num_ranks):
311
+ total = 0
312
+ for p in params:
313
+ state = param_to_state[id(p)]
314
+ if state.worker_rank != src:
315
+ continue
316
+ total += split_elems_for_src(p, rank, num_ranks)
317
+ recv_counts[src] = total
318
+
319
+ recv_total = sum(recv_counts)
320
+ assert recv_total > 0
321
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
322
+
323
+ #All2All
324
+ dist.all_to_all_single(
325
+ recv_buf,
326
+ send_buf,
327
+ output_split_sizes=recv_counts,
328
+ input_split_sizes=send_counts,
329
+ group=process_group,
330
+ )
331
+
332
+ # Copy to pre-allocated scattered_u buffer from the received buffer
333
+ #
334
+ # recv_buf (num ranks = 3, local_rank = 0)
335
+ #
336
+ # From rank 0 From rank 1 From rank 2
337
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
338
+ #
339
+ # Outer loop:
340
+ # rank 0 -> rank 1 -> rank 2
341
+ #
342
+ # Inner loop:
343
+ # src(0) : p1_0 -> p2_0 -> p3_0
344
+ # src(1) : p4_0
345
+ # src(2) : p5_0 -> p6_0
346
+
347
+ comm_stream.wait_event(alloc_event)
348
+
349
+ off = 0
350
+ for src in range(num_ranks):
351
+ block = recv_counts[src]
352
+ if block == 0:
353
+ continue
354
+
355
+ inner_off = 0
356
+ for p in params:
357
+ state = param_to_state[id(p)]
358
+ if state.worker_rank != src:
359
+ continue
360
+ n = split_elems_for_src(p, rank, num_ranks)
361
+ assert n > 0
362
+
363
+ flat_local = recv_buf.narrow(0, off + inner_off,
364
+ n).view_as(p.to_local())
365
+ state.scattered_u.copy_(flat_local)
366
+
367
+ state.scatter_event = torch.cuda.Event()
368
+ state.scatter_event.record(comm_stream)
369
+ inner_off += n
370
+
371
+ assert inner_off == block
372
+ off += block
373
+
374
+
375
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
376
+ compute_stream):
377
+ """
378
+ Update sharded parameter p with the scattered_u.
379
+ Only worker_rank frees computed_u.
380
+ """
381
+ with torch.cuda.stream(compute_stream):
382
+ if state.scatter_event is None:
383
+ raise RuntimeError("Scatter event must be set before update")
384
+ compute_stream.wait_event(state.scatter_event)
385
+ u_dtensor = DTensor.from_local(
386
+ state.scattered_u,
387
+ placements=p.placements,
388
+ device_mesh=p.device_mesh,
389
+ )
390
+
391
+ state.scattered_u = u_dtensor
392
+
393
+ if rank == state.worker_rank:
394
+ # Free computed_u
395
+ state.computed_u = None
396
+
397
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
398
+ state.scattered_u = None
399
+ u_dtensor = None
400
+
401
+ scales_full = Muon._compute_scales(p, state.qk_clip_state)
402
+ if scales_full is not None:
403
+ num_ranks = dist.get_world_size(group=state.process_group)
404
+ local_rank = dist.get_rank(group=state.process_group)
405
+ scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
406
+ scales_local = DTensor.from_local(
407
+ scales_local,
408
+ placements=p.placements,
409
+ device_mesh=p.device_mesh,
410
+ )
411
+ Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
412
+
413
+
414
+ def default_is_muon(name, x):
415
+ skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
416
+ return x.ndim >= 2 and not any(key in name for key in skip_keys)
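+
+ # Illustrative examples (hypothetical parameter names):
+ #   default_is_muon("layers.0.self_attn.q_proj.weight", torch.empty(4096, 4096)) -> True
+ #   default_is_muon("embed_tokens.weight", torch.empty(32000, 4096)) -> False (skip key)
+ #   default_is_muon("layers.0.input_layernorm.weight", torch.empty(4096)) -> False (1-D)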
417
+
418
+
419
+ def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
420
+ muon_params, muon_names = [], []
421
+ non_muon_params = []
422
+
423
+ for n, p in model.named_parameters():
424
+ if not p.requires_grad:
425
+ continue
426
+ if is_muon_func(n, p):
427
+ muon_params.append(p)
428
+ muon_names.append(n)
429
+ else:
430
+ non_muon_params.append(p)
431
+
432
+ return [
433
+ {
434
+ "params": muon_params,
435
+ "names": muon_names,
436
+ "use_muon": True,
437
+ },
438
+ {
439
+ "params": non_muon_params,
440
+ "use_muon": False,
441
+ },
442
+ ]
443
+
444
+
445
+ def parse_qk_layer(name: str) -> tuple[str | None, int]:
446
+ """
447
+ Parse a parameter name to check if it is a query/key projection layer
448
+ ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
449
+
450
+ Returns:
451
+ (kind, layer_idx) or (None, -1) if not matched.
452
+
453
+ Example:
454
+ 'model.3.attn.wq.weight' -> ('wq', 3)
455
+ 'model.5.attn.wk.weight' -> ('wk', 5)
456
+ 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
457
+ 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
458
+ 'model.4.attn.v_proj.weight' -> (None, -1)
459
+ """
460
+ parts = name.split('.')
461
+ if len(parts) < 3:
462
+ return None, -1
463
+
464
+ kind = parts[-2]
465
+
466
+ layer_idx = -1
467
+ for part in reversed(parts):
468
+ if part.isdigit():
469
+ layer_idx = int(part)
470
+ break
471
+
472
+ if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
473
+ return kind, layer_idx
474
+
475
+ return None, -1
476
+
477
+
478
+ @dataclass
479
+ class QKClipInfo:
480
+ """Per-parameter dynamic info computed from config + runtime logits."""
481
+ kind: Optional[str] # 'wq'/'q_proj' or 'wk'/'k_proj' or None
482
+ indices: List[int] # which heads to consider for clipping
483
+ head_dim: int # from config
484
+ threshold: float # from config
485
+ logit: Optional[torch.Tensor]
486
+
487
+
488
+ class Muon(torch.optim.Optimizer):
489
+ """
490
+ Muon - MomentUm Orthogonalized by Newton-schulz
491
+
492
+ Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
493
+ processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
494
+ matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
495
+ the advantage that it can be stably run in bfloat16 on the GPU.
496
+
497
+ Some warnings:
498
+ - We believe this optimizer is unlikely to work well for training with small batch size.
499
+ - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
500
+
501
+ Arguments:
502
+ params: Parameter groups to be optimized by Muon (e.g. from get_default_muon_param_groups).
503
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
504
+ lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
505
+ momentum: The momentum used by the internal SGD. (0.95 is a good default)
506
+ nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
507
+ ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
508
+ weight_decay: The weight decay for Muon and AdamW.
509
+ Parameters that are {0, 1}-D or detected as the embedding or lm_head layers are optimized by AdamW as well.
510
+ adamw_lr: The learning rate for the internal AdamW.
511
+ adamw_betas: The betas for the internal AdamW.
512
+ adamw_eps: The epsilon for the internal AdamW.
513
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
514
+ debug: Whether to print debug information.
515
+ clip_config : Configuration for QK clipping. Expected keys:
516
+ - "q_indices" (list[int]): Indices of query heads to consider.
517
+ - "k_indices" (list[int]): Indices of key heads to consider.
518
+ - "head_dim" (int): Dimensionality of each attention head.
519
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
520
+ this value will be scaled down.
521
+ Default is:
522
+ {
523
+ "q_indices": [],
524
+ "k_indices": [],
525
+ "head_dim": 128,
526
+ "threshold": 100
527
+ }
528
+ overlap_step : How many all2all gather and compute operations are launched in advance
529
+ before the corresponding all2all scatter steps begin.
530
+ A higher overlap_step increases memory usage but can improve
531
+ performance by overlapping communication.
532
+ Parallel muon only.
533
+ """
534
+
535
+ def __init__(self,
536
+ params,
537
+ lr=1e-3,
538
+ momentum=0.95,
539
+ nesterov=True,
540
+ ns_steps=5,
541
+ weight_decay=0.1,
542
+ adamw_betas=(0.9, 0.95),
543
+ adamw_eps=1e-8,
544
+ none_grad=True,
545
+ debug=False,
546
+ clip_config={
547
+ "q_indices": [],
548
+ "k_indices": [],
549
+ "head_dim": 128,
550
+ "threshold": 100
551
+ },
552
+ overlap_step=5):
553
+ defaults = dict(
554
+ lr=lr,
555
+ weight_decay=weight_decay,
556
+ momentum=momentum,
557
+ nesterov=nesterov,
558
+ ns_steps=ns_steps,
559
+ adamw_betas=adamw_betas,
560
+ adamw_eps=adamw_eps,
561
+ none_grad=none_grad,
562
+ use_muon=True,
563
+ )
564
+ error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
565
+ instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
566
+
567
+ if isinstance(params, types.GeneratorType):
568
+ raise ValueError(error_message.format(idx=0) + instruction_code)
569
+ for _idx, param_group in enumerate(params):
570
+ if param_group.get("use_muon", None) is None:
571
+ raise ValueError(
572
+ error_message.format(idx=_idx) + instruction_code)
573
+
574
+ super().__init__(params, defaults)
575
+
576
+ self.rank = None
577
+
578
+ self.comm_stream = torch.cuda.Stream()
579
+ self.compute_stream = torch.cuda.Stream()
580
+ self.debug = debug
581
+ self.clip_config = clip_config
582
+ self.overlap_step = overlap_step
583
+
584
+ def _calc_flops(self, G, steps):
585
+ assert len(G.shape) == 2
586
+ M, N = G.shape
587
+ if M > N:
588
+ M, N = N, M
589
+
590
+ return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
591
+
592
+ def adjust_lr_for_muon(self, lr, param_shape):
593
+ A, B = param_shape[:2]
594
+ # We adjust the learning rate and weight decay based on the size of the parameter matrix
595
+ # as described in the paper
596
+ adjusted_ratio = 0.2 * math.sqrt(max(A, B))
597
+ adjusted_lr = lr * adjusted_ratio
598
+ return adjusted_lr
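+
+ # Worked example (illustrative): for a (4096, 1024) weight, max(A, B) = 4096,
+ # so adjusted_ratio = 0.2 * sqrt(4096) = 12.8; with lr = 0.02 the effective
+ # Muon step size becomes adjusted_lr = 0.256.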
599
+
600
+ def get_shard_mesh(self, p):
601
+ """
602
+ Get the shard mesh for a parameter p on the given rank.
603
+ """
604
+ assert isinstance(
605
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
606
+
607
+ if p.placements == (Shard(dim=0), ):
608
+ # Case for FSDP
609
+ process_group = p.device_mesh.get_group(mesh_dim=0)
610
+ if self.rank is None:
611
+ self.rank = dist.get_rank(group=process_group)
612
+ else:
613
+ assert self.rank == dist.get_rank(group=process_group)
614
+ return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
615
+ elif p.placements == (Replicate(), Shard(dim=0)):
616
+ # Case for HSDP
617
+ process_group = p.device_mesh.get_group(mesh_dim=1)
618
+ if self.rank is None:
619
+ self.rank = dist.get_rank(group=process_group)
620
+ else:
621
+ assert self.rank == dist.get_rank(group=process_group)
622
+ for i, shard_mesh in enumerate(p.device_mesh.mesh):
623
+ if self.rank in shard_mesh:
624
+ return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
625
+ else:
626
+ raise ValueError(f"Unsupported placements ({p.placements}).")
627
+
628
+ def init_state_and_assign_params(self, names, params, group, qk_logits):
629
+ param_to_state = {}
630
+ param_to_flops = {}
631
+
632
+ total_flops = 0
633
+ for p in params:
634
+ g = p.grad
635
+ if g is None:
636
+ continue
637
+ assert g.ndim == 2, "Muon only supports 2D parameters."
638
+
639
+ flops = self._calc_flops(g, group["ns_steps"])
640
+ param_to_flops[id(p)] = flops
641
+ total_flops += flops
642
+
643
+ if self.debug:
644
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
645
+ flush=True)
646
+
647
+ paired = list(zip(names, params))
648
+
649
+ paired_sorted = sorted(paired,
650
+ key=lambda x: param_to_flops[id(x[1])],
651
+ reverse=True)
652
+
653
+ names_sorted, params_sorted = zip(*paired_sorted)
654
+ ordered_names = list(names_sorted)
655
+ ordered_params = list(params_sorted)
656
+
657
+ round_robin = 0
658
+ mesh = None
659
+ shard_mesh = None
660
+ process_group = None
661
+ for n, p in zip(ordered_names, ordered_params):
662
+ if mesh is None:
663
+ mesh = p.device_mesh
664
+ shard_mesh, process_group = self.get_shard_mesh(p)
665
+ elif mesh != p.device_mesh:
666
+ raise ValueError("All parameters must be on the same mesh.")
667
+ num_ranks = dist.get_world_size(group=process_group)
668
+ param_to_state[id(p)] = _muon_state()
669
+ param_to_state[id(
670
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
671
+ param_to_state[id(p)].process_group = process_group
672
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
673
+ param_to_state[id(p)].qk_clip_state = qk_clip_state
674
+ round_robin = (round_robin + 1) % len(shard_mesh)
675
+
676
+ return param_to_state, ordered_params
677
+
678
+ def base(self, names, params, group, lr, weight_decay, momentum,
679
+ qk_logits):
680
+ # generate weight updates in distributed fashion
681
+ for n, p in zip(names, params):
682
+ g = p.grad
683
+ if g is None:
684
+ continue
685
+ if g.ndim > 2:
686
+ g = g.view(g.size(0), -1)
687
+ assert g is not None
688
+
689
+ # calc update
690
+ state = self.state[p]
691
+ if "momentum_buffer" not in state:
692
+ state["momentum_buffer"] = torch.zeros_like(g)
693
+ buf = state["momentum_buffer"]
694
+ buf.mul_(momentum).add_(g)
695
+ if group["nesterov"]:
696
+ g = g.add(buf, alpha=momentum)
697
+ else:
698
+ g = buf
699
+
700
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
701
+ steps=group["ns_steps"])
702
+
703
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
704
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
705
+
706
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
707
+
708
+ scales_full = self._compute_scales(p, qk_clip_state)
709
+ if scales_full is not None:
710
+ Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
711
+
712
+ def _update_g(self, p, g, group, momentum):
713
+ # calc update
714
+ state = self.state[p]
715
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
716
+ torch.add(g, buf, alpha=momentum, out=buf)
717
+ if group["nesterov"]:
718
+ g.add_(buf, alpha=momentum)
719
+ return g
720
+ return buf
721
+
722
+ @staticmethod
723
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
724
+ # apply weight decay
725
+ p.data.mul_(1 - lr * weight_decay)
726
+ # apply update
727
+ p.data.add_(u, alpha=-adjusted_lr)
728
+
729
+ def get_qk_clip_info(self, n, qk_logits):
730
+ head_dim = self.clip_config.get('head_dim')
731
+ threshold = self.clip_config.get('threshold')
732
+ kind, layer_idx = parse_qk_layer(n)
733
+
734
+ logit, indices = None, []
735
+ if qk_logits is not None and kind is not None:
736
+ logit = qk_logits[layer_idx]
737
+ indices_key = 'q_indices' if 'q' in kind else 'k_indices'
738
+ indices = self.clip_config.get(indices_key, []) or []
739
+
740
+ return QKClipInfo(
741
+ kind=kind,
742
+ indices=indices,
743
+ head_dim=head_dim,
744
+ threshold=threshold,
745
+ logit=logit,
746
+ )
747
+
748
+ @staticmethod
749
+ def _compute_scales(p, qk_clip_state):
750
+ kind = qk_clip_state.kind
751
+ indices = qk_clip_state.indices
752
+ head_dim = qk_clip_state.head_dim
753
+ threshold = qk_clip_state.threshold
754
+ logit = qk_clip_state.logit
755
+
756
+ H_global = p.shape[0] // head_dim
757
+ scales_full = torch.ones(H_global, device=p.data.device)
758
+ scaling = 0
759
+
760
+ for logit_idx, head_idx in enumerate(indices):
761
+ v_ele = float(logit[logit_idx])
762
+ if v_ele > threshold:
763
+ new_scale = math.sqrt(threshold / v_ele)
764
+ if new_scale < scales_full[head_idx]:
765
+ scales_full[head_idx] = new_scale
766
+ logger.info(
767
+ f"[{kind}] Head {head_idx} exceeded threshold "
768
+ f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
769
+ )
770
+ scaling += 1
771
+
772
+ return scales_full if scaling > 0 else None
773
+
774
+ @staticmethod
775
+ def _qk_clip(p, scales, head_dim):
776
+ W = p.data.view(-1, head_dim, p.data.shape[1])
777
+ W.mul_(scales.view(-1, 1, 1))
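+
+ # Illustrative numbers: with threshold = 100 and a head whose max QK logit is
+ # 400, _compute_scales returns sqrt(100 / 400) = 0.5 for that head, and
+ # _qk_clip scales that head's head_dim rows of the weight by 0.5. Applying the
+ # same sqrt factor to both the q and k projections scales the head's QK logit
+ # by 0.25, pulling it back down to the threshold.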
778
+
779
+ def parallel(self, names, params, group, lr, weight_decay, momentum,
780
+ qk_logits):
781
+ """
782
+ Perform a parallel optimization step using Muon.
783
+ """
784
+
785
+ for p in params:
786
+ g = p.grad
787
+ if g is None:
788
+ continue
789
+ if g.ndim > 2:
790
+ g = g.view(g.size(0), -1)
791
+
792
+ # Update g in the local rank
793
+ g = self._update_g(
794
+ p,
795
+ g,
796
+ group,
797
+ momentum=momentum,
798
+ )
799
+ p.grad = g
800
+
801
+ param_to_state, ordered_params = self.init_state_and_assign_params(
802
+ names, params, group, qk_logits)
803
+
804
+ assert self.rank is not None
805
+
806
+ def enqueue_all2all_gather(start_idx, chunk_size):
807
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
808
+ if target_params:
809
+ alloc_event = _alloc_gathered_grad(target_params,
810
+ param_to_state, self.rank,
811
+ self.compute_stream)
812
+ _all2all_gather(target_params, param_to_state, self.rank,
813
+ self.comm_stream, group["none_grad"],
814
+ alloc_event)
815
+
816
+ def enqueue_computes(start_idx, chunk_size):
817
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
818
+ state = param_to_state[id(p)]
819
+ _compute_u(p, state, group["ns_steps"], self.rank,
820
+ self.compute_stream)
821
+
822
+ def enqueue_all2all_scatter(start_idx, chunk_size):
823
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
824
+ if target_params:
825
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
826
+ self.rank,
827
+ self.compute_stream)
828
+ _all2all_scatter(target_params, param_to_state, self.rank,
829
+ self.comm_stream, alloc_event)
830
+
831
+ def enqueue_update_param(start_idx, chunk_size):
832
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
833
+ state = param_to_state[id(p)]
834
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
835
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
836
+ self.rank, self.compute_stream)
837
+
838
+ chunk_size = dist.get_world_size(param_to_state[id(
839
+ params[0])].process_group)
840
+
841
+ # Wait grad update
842
+ self.comm_stream.wait_stream(torch.cuda.current_stream())
843
+
844
+ overlap_step = self.overlap_step
845
+ for i in range(0, overlap_step):
846
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
847
+ enqueue_computes(i * chunk_size, chunk_size)
848
+
849
+ for i in range(0, len(params) + chunk_size - 1, chunk_size):
850
+ enqueue_all2all_scatter(i, chunk_size)
851
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
852
+ enqueue_update_param(i, chunk_size)
853
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
854
+
855
+ # Wait the last update_param to finish
856
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
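+
+ # Scheduling sketch (illustrative): with overlap_step = 2 and chunk_size = 4,
+ # the loops above enqueue roughly
+ #   gather[0:4], compute[0:4], gather[4:8], compute[4:8],            (warm-up)
+ #   scatter[0:4], gather[8:12], update[0:4], compute[8:12],
+ #   scatter[4:8], gather[12:16], update[4:8], compute[12:16], ...
+ # so communication for later chunks overlaps Newton-Schulz compute for earlier
+ # ones, at the cost of keeping overlap_step chunks of gathered gradients and
+ # computed updates alive at once.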
857
+
858
+ @staticmethod
859
+ def _fused_adamw(
860
+ params: list[torch.Tensor],
861
+ grads: list[torch.Tensor],
862
+ exp_avgs: list[torch.Tensor],
863
+ exp_avg_sqs: list[torch.Tensor],
864
+ max_exp_avg_sqs: list[torch.Tensor],
865
+ state_steps: list[torch.Tensor],
866
+ amsgrad: bool,
867
+ beta1: float,
868
+ beta2: float,
869
+ lr: Union[float, torch.Tensor],
870
+ weight_decay: float,
871
+ eps: float,
872
+ maximize: bool,
873
+ ) -> None:
874
+ if not params:
875
+ return
876
+
877
+ # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
878
+ # treating it as a scalar.
879
+ lr_dict: Optional[DeviceDict] = ({
880
+ lr.device: lr
881
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
882
+ None)
883
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
884
+ [
885
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
886
+ state_steps
887
+ ] # type: ignore[list-item]
888
+ )
889
+ for (device, _), (
890
+ (
891
+ device_params_,
892
+ device_grads_,
893
+ device_exp_avgs_,
894
+ device_exp_avg_sqs_,
895
+ device_max_exp_avg_sqs,
896
+ device_state_steps_,
897
+ ),
898
+ _,
899
+ ) in grouped_tensors.items():
900
+ device_params = cast(list[torch.Tensor], device_params_)
901
+ device_grads = cast(list[torch.Tensor], device_grads_)
902
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
903
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
904
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
905
+
906
+ if lr_dict is not None and device not in lr_dict:
907
+ lr_dict[device] = lr.to(
908
+ device=device,
909
+ non_blocking=True) # type: ignore[union-attr]
910
+ lr = lr_dict[device]
911
+ torch._foreach_add_(device_state_steps, 1)
912
+ func = torch._fused_adamw_
913
+ func(
914
+ device_params,
915
+ device_grads,
916
+ device_exp_avgs,
917
+ device_exp_avg_sqs,
918
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
919
+ device_state_steps,
920
+ amsgrad=amsgrad,
921
+ lr=lr, # type: ignore[arg-type]
922
+ beta1=beta1,
923
+ beta2=beta2,
924
+ weight_decay=weight_decay,
925
+ eps=eps,
926
+ maximize=maximize,
927
+ )
928
+
929
+ def step(self, closure=None, qk_logits=None):
930
+ """Perform a single optimization step.
931
+
932
+ Args:
933
+ closure (Callable, optional): A closure that reevaluates the model
934
+ and returns the loss.
935
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
936
+ to 1D tensors of shape (num_heads,), representing the maximum
937
+ QK logits across all tokens, computed as
938
+ (1 / sqrt(head_dim)) * (Q @ K^T).
939
+ """
940
+ loss = None
941
+ if closure is not None:
942
+ with torch.enable_grad():
943
+ loss = closure()
944
+
945
+ for group in self.param_groups:
946
+ params = group["params"]
947
+
948
+ if group["use_muon"]:
949
+ ############################
950
+ # Muon #
951
+ ############################
952
+ lr = group["lr"]
953
+ weight_decay = group["weight_decay"]
954
+ momentum = group["momentum"]
955
+ names = group["names"]
956
+
957
+ param_dtensors = []
958
+ param_tensors = []
959
+ name_dtensors = []
960
+ name_tensors = []
961
+
962
+ for n, p in zip(names, params):
963
+ if p is None or p.grad is None:
964
+ continue
965
+ if isinstance(p.data, DTensor):
966
+ if all(
967
+ isinstance(placement, Replicate)
968
+ for placement in p.placements):
969
+ param_tensors.append(p)
970
+ name_tensors.append(n)
971
+ else:
972
+ param_dtensors.append(p)
973
+ name_dtensors.append(n)
974
+ elif isinstance(p.data, torch.Tensor):
975
+ param_tensors.append(p)
976
+ name_tensors.append(n)
977
+ else:
978
+ raise TypeError(
979
+ f"Unsupported parameter type: {type(p.data)}")
980
+
981
+ if self.debug:
982
+ print(
983
+ f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
984
+ flush=True,
985
+ )
986
+
987
+ if len(param_dtensors) > 0:
988
+ if not dist.is_initialized():
989
+ raise RuntimeError(
990
+ "Parallel Muon requires torch.distributed to be initialized."
991
+ )
992
+
993
+ self.parallel(
994
+ name_dtensors,
995
+ param_dtensors,
996
+ group,
997
+ lr=lr,
998
+ weight_decay=weight_decay,
999
+ momentum=momentum,
1000
+ qk_logits=qk_logits,
1001
+ )
1002
+
1003
+ if len(param_tensors) > 0:
1004
+ self.base(
1005
+ name_tensors,
1006
+ param_tensors,
1007
+ group,
1008
+ lr=lr,
1009
+ weight_decay=weight_decay,
1010
+ momentum=momentum,
1011
+ qk_logits=qk_logits,
1012
+ )
1013
+
1014
+ else:
1015
+ ############################
1016
+ # AdamW backup #
1017
+ ############################
1018
+
1019
+ params_with_grads = []
1020
+ grads = []
1021
+ moment1 = []
1022
+ moment2 = []
1023
+ max_exp_avg_sqs = []
1024
+ state_steps = []
1025
+ lr = group["lr"]
1026
+ beta1, beta2 = group["adamw_betas"]
1027
+ eps = group["adamw_eps"]
1028
+ weight_decay = group["weight_decay"]
1029
+
1030
+ for p in params:
1031
+ g = p.grad
1032
+ if g is None:
1033
+ continue
1034
+ state = self.state[p]
1035
+ params_with_grads.append(p)
1036
+ grads.append(g)
1037
+ if "step" not in state:
1038
+ state["step"] = (torch.zeros((),
1039
+ dtype=torch.float32,
1040
+ device=p.device))
1041
+ state["moment1"] = torch.zeros_like(g)
1042
+ state["moment2"] = torch.zeros_like(g)
1043
+ moment1.append(state["moment1"])
1044
+ moment2.append(state["moment2"])
1045
+ if not isinstance(state["step"], torch.Tensor):
1046
+ step_tensor = torch.tensor(state["step"],
1047
+ dtype=torch.float32,
1048
+ device=p.device)
1049
+ else:
1050
+ step_tensor = state["step"]
1051
+ state_steps.append(step_tensor)
1052
+
1053
+ self._fused_adamw(
1054
+ params_with_grads,
1055
+ grads,
1056
+ moment1,
1057
+ moment2,
1058
+ max_exp_avg_sqs,
1059
+ state_steps,
1060
+ amsgrad=False,
1061
+ beta1=beta1,
1062
+ beta2=beta2,
1063
+ lr=lr,
1064
+ weight_decay=weight_decay,
1065
+ eps=eps,
1066
+ maximize=False,
1067
+ )
1068
+
1069
+ return loss
build/torch29-cxx11-cu130-x86_64-linux/optimizer/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .muon import Muon
2
+
3
+ __all__ = [
4
+ "Muon",
5
+ ]
build/torch29-cxx11-cu130-x86_64-linux/optimizer/_ops.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from . import _optimizer_811726c_dirty
3
+ ops = torch.ops._optimizer_811726c_dirty
4
+
5
+ def add_op_namespace_prefix(op_name: str):
6
+ """
7
+ Prefix op by namespace.
8
+ """
9
+ return f"_optimizer_811726c_dirty::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52a744cf30c60fe1e8fc35ebb0d3421d679bb2047fbb4602846bd6902cfa9e52
3
+ size 1872152
build/torch29-cxx11-cu130-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
build/torch29-cxx11-cu130-x86_64-linux/optimizer/muon.py ADDED
@@ -0,0 +1,1069 @@
1
+ import logging
2
+ import math
3
+ import types
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional, Union, cast
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.distributed._tensor import DTensor, Replicate, Shard
10
+
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
+
18
+ # This code snippet is a modified version adapted from the following GitHub repositories:
19
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
+ # Muon's Newton–Schulz iteration causes high variance in singular values
21
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
+ @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
+ def _zeropower_via_newtonschulz5(G, steps):
25
+ """
26
+ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
27
+ quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
28
+ of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
29
+ zero even beyond the point where the iteration no longer converges all the way to one everywhere
30
+ on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
31
+ where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
32
+ performance at all relative to UV^T, where USV^T = G is the SVD.
33
+ """
34
+ assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
+ X = G # no manual typecast
37
+
38
+ if G.size(0) > G.size(1):
39
+ X = X.T
40
+ # Ensure spectral norm is at most 1
41
+ X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
+ # Perform the NS iterations
45
+ for a, b, c in [
46
+ (4.0848, -6.8946, 2.9270),
47
+ (3.9505, -6.3029, 2.6377),
48
+ (3.7418, -5.5913, 2.3037),
49
+ (2.8769, -3.1427, 1.2046),
50
+ (2.8366, -3.0525, 1.2012),
51
+ ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
56
+
57
+ if G.size(0) > G.size(1):
58
+ X = X.T
59
+ return X
60
+
61
+
62
+ @dataclass
63
+ class _muon_state:
64
+ # TODO: use Optional
65
+ worker_rank: int | None = None
66
+ gathered_grad: torch.Tensor | None = None
67
+ scattered_u: DTensor | None = None
68
+ computed_u: torch.Tensor | None = None
69
+ gather_event: torch.cuda.Event | None = None
70
+ compute_event: torch.cuda.Event | None = None
71
+ scatter_event: torch.cuda.Event | None = None
72
+ process_group = None
73
+ qk_clip_state = None
74
+
75
+
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
+ @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
+ """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
+ """
112
+ with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
+
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert any(
132
+ len(v) > 0 for v in per_dst
133
+ ), "At least one destination rank must receive a sharded tensor"
134
+ # list[list[Tensor]] -> list[Tensor]
135
+ per_dst = [t for dst in per_dst for t in dst]
136
+
137
+ send_buf = torch.cat(per_dst, dim=0)
138
+
139
+ owned_params = [
140
+ p for p in params if param_to_state[id(p)].worker_rank == rank
141
+ ]
142
+
143
+ # Compute receive sizes and allocate receiving buffers
144
+ recv_counts = [0] * num_ranks
145
+
146
+ for src in range(num_ranks):
147
+ total = 0
148
+ for p in owned_params:
149
+ state = param_to_state[id(p)]
150
+ assert state.worker_rank == rank
151
+ total += split_elems_for_src(p, src, num_ranks)
152
+ recv_counts[src] = total
153
+
154
+ recv_total = sum(recv_counts)
155
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
156
+
157
+ #All2All
158
+ dist.all_to_all_single(
159
+ recv_buf,
160
+ send_buf,
161
+ output_split_sizes=recv_counts,
162
+ input_split_sizes=send_counts,
163
+ group=process_group,
164
+ )
165
+
166
+ # Reconstructs gathered grad from the received buffer
167
+ #
168
+ # recv_buf (num ranks = 3)
169
+ #
170
+ # From rank 0 From rank 1 From rank 2
171
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
172
+ #
173
+ # Outer loop:
174
+ # rank 0 -> rank 1 -> rank 2
175
+ #
176
+ # Inner loop:
177
+ # p1_n -> p2_n -> p3_n
178
+
179
+ comm_stream.wait_event(alloc_event)
180
+
181
+ off = 0
182
+ write_offsets = {id(p): 0 for p in owned_params}
183
+ for src in range(num_ranks):
184
+ if recv_counts[src] == 0:
185
+ continue
186
+
187
+ block = recv_counts[src]
188
+ inner_off = 0
189
+ for p in owned_params:
190
+ state = param_to_state[id(p)]
191
+ assert state.worker_rank == rank
192
+ n = split_elems_for_src(p, src, num_ranks)
193
+ assert n > 0
194
+
195
+ sg = recv_buf.narrow(0, off + inner_off, n)
196
+ woff = write_offsets[id(p)]
197
+ dst = state.gathered_grad.narrow(0, woff, n)
198
+ dst.copy_(sg)
199
+
200
+ write_offsets[id(p)] += n
201
+ inner_off += n
202
+ off += block
203
+
204
+ for p in params:
205
+ state = param_to_state[id(p)]
206
+ if state.worker_rank == rank:
207
+ state.gathered_grad = state.gathered_grad.view_as(p)
208
+ state.gather_event = torch.cuda.Event()
209
+ state.gather_event.record(comm_stream)
210
+ else:
211
+ state.gathered_grad = None
212
+ state.gather_event = None
213
+ if none_grad:
214
+ p.grad = None
215
+
216
+
217
+ @torch.no_grad()
218
+ def _compute_u(p, state, steps, rank, compute_stream):
219
+ """
220
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
221
+ """
222
+ with torch.cuda.stream(compute_stream):
223
+ if rank == state.worker_rank:
224
+ if state.gather_event is None:
225
+ raise RuntimeError("Gather event must be set before compute.")
226
+ compute_stream.wait_event(state.gather_event)
227
+ u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
228
+ state.gathered_grad = None
229
+ state.computed_u = u
230
+ state.compute_event = torch.cuda.Event()
231
+ state.compute_event.record()
232
+ else:
233
+ state.computed_u = None
234
+ state.compute_event = None
235
+
236
+
237
+ @torch.no_grad()
238
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
239
+ """
240
+ Pre-allocate scattered_u buffer on compute_stream
241
+ before launching all2all scatter
242
+ """
243
+ with torch.cuda.stream(compute_stream):
244
+ for p in params:
245
+ state = param_to_state[id(p)]
246
+ state.scattered_u = torch.empty_like(p.to_local(),
247
+ dtype=COMM_DTYPE)
248
+
249
+ alloc_event = torch.cuda.Event()
250
+ alloc_event.record(compute_stream)
251
+ return alloc_event
252
+
253
+
254
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
255
+ """
256
+ All2all scatters the computed orthogonalized updates back to all ranks as shards
257
+ """
258
+ with torch.cuda.stream(comm_stream):
259
+ process_group = param_to_state[id(params[0])].process_group
260
+ num_ranks = dist.get_world_size(group=process_group)
261
+ owned_params = [
262
+ p for p in params if param_to_state[id(p)].worker_rank == rank
263
+ ]
264
+
265
+ # Construct sending buffer
266
+ per_dst = [[] for _ in range(num_ranks)]
267
+ send_counts = [0] * num_ranks
268
+
269
+ if owned_params:
270
+ for p in owned_params:
271
+ state = param_to_state[id(p)]
272
+ if state.compute_event is None:
273
+ raise RuntimeError(
274
+ "Compute event must be set before scatter.")
275
+ comm_stream.wait_event(state.compute_event)
276
+ state.gathered_grad = None
277
+
278
+ assert state.computed_u is not None
279
+
280
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
281
+
282
+ offset = 0
283
+ for dst in range(num_ranks):
284
+ n = split_elems_for_src(p, dst, num_ranks)
285
+ assert n > 0
286
+
287
+ su = u_full.narrow(0, offset, n)
288
+ per_dst[dst].append(su)
289
+ send_counts[dst] += n
290
+ offset += n
291
+
292
+ assert offset == u_full.numel()
293
+
294
+ lengths = [len(v) for v in per_dst]
295
+ if all(l > 0 for l in lengths):
296
+ assert all(
297
+ l == lengths[0] for l in lengths
298
+ ), "All destination ranks must have the same number of sharded tensor"
299
+ # list[list[Tensor]] -> list[Tensor]
300
+ per_dst = [t for dst in per_dst for t in dst]
301
+ send_buf = torch.cat(per_dst, dim=0)
302
+ else:
303
+ # all_to_all requires participation from all ranks
304
+ # Even non-owner ranks must join the collective call
305
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
306
+
307
+ # Compute receive sizes and allocate receiving buffers
308
+ recv_counts = [0] * num_ranks
309
+
310
+ for src in range(num_ranks):
311
+ total = 0
312
+ for p in params:
313
+ state = param_to_state[id(p)]
314
+ if state.worker_rank != src:
315
+ continue
316
+ total += split_elems_for_src(p, rank, num_ranks)
317
+ recv_counts[src] = total
318
+
319
+ recv_total = sum(recv_counts)
320
+ assert recv_total > 0
321
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
322
+
323
+ #All2All
324
+ dist.all_to_all_single(
325
+ recv_buf,
326
+ send_buf,
327
+ output_split_sizes=recv_counts,
328
+ input_split_sizes=send_counts,
329
+ group=process_group,
330
+ )
331
+
332
+ # Copy to pre-allocated scattered_u buffer from the received buffer
333
+ #
334
+ # recv_buf (num ranks = 3, local_rank = 0)
335
+ #
336
+ # From rank 0 From rank 1 From rank 2
337
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
338
+ #
339
+ # Outer loop:
340
+ # rank 0 -> rank 1 -> rank 2
341
+ #
342
+ # Inner loop:
343
+ # src(0) : p1_0 -> p2_0 -> p3_0
344
+ # src(1) : p4_0
345
+ # src(2) : p5_0 -> p6_0
346
+
347
+ comm_stream.wait_event(alloc_event)
348
+
349
+ off = 0
350
+ for src in range(num_ranks):
351
+ block = recv_counts[src]
352
+ if block == 0:
353
+ continue
354
+
355
+ inner_off = 0
356
+ for p in params:
357
+ state = param_to_state[id(p)]
358
+ if state.worker_rank != src:
359
+ continue
360
+ n = split_elems_for_src(p, rank, num_ranks)
361
+ assert n > 0
362
+
363
+ flat_local = recv_buf.narrow(0, off + inner_off,
364
+ n).view_as(p.to_local())
365
+ state.scattered_u.copy_(flat_local)
366
+
367
+ state.scatter_event = torch.cuda.Event()
368
+ state.scatter_event.record(comm_stream)
369
+ inner_off += n
370
+
371
+ assert inner_off == block
372
+ off += block
373
+
374
+
375
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
376
+ compute_stream):
377
+ """
378
+ Update sharded parameter p with the scattered_u.
379
+ Only worker_rank frees computed_u.
380
+ """
381
+ with torch.cuda.stream(compute_stream):
382
+ if state.scatter_event is None:
383
+ raise RuntimeError("Scatter event must be set before update")
384
+ compute_stream.wait_event(state.scatter_event)
385
+ u_dtensor = DTensor.from_local(
386
+ state.scattered_u,
387
+ placements=p.placements,
388
+ device_mesh=p.device_mesh,
389
+ )
390
+
391
+ state.scattered_u = u_dtensor
392
+
393
+ if rank == state.worker_rank:
394
+ # Free computed_u
395
+ state.computed_u = None
396
+
397
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
398
+ state.scattered_u = None
399
+ u_dtensor = None
400
+
401
+ scales_full = Muon._compute_scales(p, state.qk_clip_state)
402
+ if scales_full is not None:
403
+ num_ranks = dist.get_world_size(group=state.process_group)
404
+ local_rank = dist.get_rank(group=state.process_group)
405
+ scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
406
+ scales_local = DTensor.from_local(
407
+ scales_local,
408
+ placements=p.placements,
409
+ device_mesh=p.device_mesh,
410
+ )
411
+ Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
412
+
413
+
414
+ def default_is_muon(name, x):
415
+ skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
416
+ return x.ndim >= 2 and not any(key in name for key in skip_keys)
417
+
418
+
419
+ def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
420
+ muon_params, muon_names = [], []
421
+ non_muon_params = []
422
+
423
+ for n, p in model.named_parameters():
424
+ if not p.requires_grad:
425
+ continue
426
+ if is_muon_func(n, p):
427
+ muon_params.append(p)
428
+ muon_names.append(n)
429
+ else:
430
+ non_muon_params.append(p)
431
+
432
+ return [
433
+ {
434
+ "params": muon_params,
435
+ "names": muon_names,
436
+ "use_muon": True,
437
+ },
438
+ {
439
+ "params": non_muon_params,
440
+ "use_muon": False,
441
+ },
442
+ ]
443
+
444
+
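A short sketch of how these two helpers combine (illustrative; `model` is any torch.nn.Module of yours and the hyperparameters are placeholders):

    params = get_default_muon_param_groups(model)
    # params[0]: 2D weights (plus their names) routed to Muon
    # params[1]: embeddings, lm_head, biases and norms routed to the AdamW fallback
    optim = Muon(params, lr=2e-2, weight_decay=0.1)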
445
+ def parse_qk_layer(name: str) -> tuple[str | None, int]:
446
+ """
447
+ Parse a parameter name to check if it is a query/key projection layer
448
+ ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
449
+
450
+ Returns:
451
+ (kind, layer_idx) or (None, -1) if not matched.
452
+
453
+ Example:
454
+ 'model.3.attn.wq.weight' -> ('wq', 3)
455
+ 'model.5.attn.wk.weight' -> ('wk', 5)
456
+ 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
457
+ 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
458
+ 'model.4.attn.v_proj.weight' -> (None, -1)
459
+ """
460
+ parts = name.split('.')
461
+ if len(parts) < 3:
462
+ return None, -1
463
+
464
+ kind = parts[-2]
465
+
466
+ layer_idx = -1
467
+ for part in reversed(parts):
468
+ if part.isdigit():
469
+ layer_idx = int(part)
470
+ break
471
+
472
+ if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
473
+ return kind, layer_idx
474
+
475
+ return None, -1
476
+
477
+
478
+ @dataclass
479
+ class QKClipInfo:
480
+ """Per-parameter dynamic info computed from config + runtime logits."""
481
+ kind: Optional[str] # 'wq'/'q_proj' or 'wk'/'k_proj' or None
482
+ indices: List[int] # which heads to consider for clipping
483
+ head_dim: int # from config
484
+ threshold: float # from config
485
+ logit: Optional[torch.Tensor]
486
+
487
+
488
+ class Muon(torch.optim.Optimizer):
489
+ """
490
+ Muon - MomentUm Orthogonalized by Newton-schulz
491
+
492
+ Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
493
+ processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
494
+ matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
495
+ the advantage that it can be stably run in bfloat16 on the GPU.
496
+
497
+ Some warnings:
498
+ - We believe this optimizer is unlikely to work well for training with small batch size.
499
+ - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
500
+
501
+ Arguments:
502
+ params: Parameter groups to optimize. Build them with get_default_muon_param_groups(model); every group must define the key 'use_muon'.
503
+ is_muon_func: A function that takes a parameter name and the parameter, and returns whether that parameter should be optimized by Muon (used by get_default_muon_param_groups, not passed to this constructor).
504
+ lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
505
+ momentum: The momentum used by the internal SGD. (0.95 is a good default)
506
+ nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
507
+ ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
508
+ weight_decay: The weight decay applied by both Muon and the internal AdamW.
509
+ Parameters that are {0, 1}-D, or detected as embedding / lm_head weights, are optimized by AdamW instead of Muon.
510
+ adamw_lr: The learning rate for the internal AdamW.
511
+ adamw_betas: The betas for the internal AdamW.
512
+ adamw_eps: The epsilon for the internal AdamW.
513
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
514
+ debug: Whether to print debug information.
515
+ clip_config : Configuration for QK clipping. Expected keys:
516
+ - "q_indices" (list[int]): Indices of query heads to consider.
517
+ - "k_indices" (list[int]): Indices of key heads to consider.
518
+ - "head_dim" (int): Dimensionality of each attention head.
519
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
520
+ this value will be scaled down.
521
+ Default is:
522
+ {
523
+ "q_indices": [],
524
+ "k_indices": [],
525
+ "head_dim": 128,
526
+ "threshold": 100
527
+ }
528
+ overlap_step : How many all2all gather and compute operations are launched in advance
529
+ before the corresponding all2all scatter steps begin.
530
+ A higher overlap_step increases memory usage but can improve
531
+ performance by overlapping communication.
532
+ Parallel muon only.
533
+ """
534
+
535
+ def __init__(self,
536
+ params,
537
+ lr=1e-3,
538
+ momentum=0.95,
539
+ nesterov=True,
540
+ ns_steps=5,
541
+ weight_decay=0.1,
542
+ adamw_betas=(0.9, 0.95),
543
+ adamw_eps=1e-8,
544
+ none_grad=True,
545
+ debug=False,
546
+ clip_config={
547
+ "q_indices": [],
548
+ "k_indices": [],
549
+ "head_dim": 128,
550
+ "threshold": 100
551
+ },
552
+ overlap_step=5):
553
+ defaults = dict(
554
+ lr=lr,
555
+ weight_decay=weight_decay,
556
+ momentum=momentum,
557
+ nesterov=nesterov,
558
+ ns_steps=ns_steps,
559
+ adamw_betas=adamw_betas,
560
+ adamw_eps=adamw_eps,
561
+ none_grad=none_grad,
562
+ use_muon=True,
563
+ )
564
+ error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
565
+ instruction_code = "\n\nPlease follow this code snippet:\n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
566
+
567
+ if isinstance(params, types.GeneratorType):
568
+ raise ValueError(error_message.format(idx=0) + instruction_code)
569
+ for _idx, param_group in enumerate(params):
570
+ if param_group.get("use_muon", None) is None:
571
+ raise ValueError(
572
+ error_message.format(idx=_idx) + instruction_code)
573
+
574
+ super().__init__(params, defaults)
575
+
576
+ self.rank = None
577
+
578
+ self.comm_stream = torch.cuda.Stream()
579
+ self.compute_stream = torch.cuda.Stream()
580
+ self.debug = debug
581
+ self.clip_config = clip_config
582
+ self.overlap_step = overlap_step
583
+
584
+ def _calc_flops(self, G, steps):
585
+ assert len(G.shape) == 2
586
+ M, N = G.shape
587
+ if M > N:
588
+ M, N = N, M
589
+
590
+ return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
591
+
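Plugging in numbers (illustrative): a 1024 x 4096 gradient with 5 Newton-Schulz steps gives roughly 5 * (2*1024^3 + 4*1024^2*4096 + 2*1024*4096 + 3*1024^2) ≈ 9.7e10 FLOPs, i.e. about 0.1 TFLOPs; this per-parameter count is what the scheduler below uses to balance work across ranks.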
592
+ def adjust_lr_for_muon(self, lr, param_shape):
593
+ A, B = param_shape[:2]
594
+ # We adjust the learning rate and weight decay based on the size of the parameter matrix
595
+ # as described in the paper
596
+ adjusted_ratio = 0.2 * math.sqrt(max(A, B))
597
+ adjusted_lr = lr * adjusted_ratio
598
+ return adjusted_lr
599
+
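Worked example (illustrative numbers): with lr = 0.02 and a (4096, 1024) weight, 0.2 * sqrt(4096) = 12.8, so adjusted_lr = 0.02 * 12.8 = 0.256 is what ultimately multiplies the orthogonalized update for that matrix.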
600
+ def get_shard_mesh(self, p):
601
+ """
602
+ Get the shard mesh for a parameter p on the given rank.
603
+ """
604
+ assert isinstance(
605
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
606
+
607
+ if p.placements == (Shard(dim=0), ):
608
+ # Case for FSDP
609
+ process_group = p.device_mesh.get_group(mesh_dim=0)
610
+ if self.rank is None:
611
+ self.rank = dist.get_rank(group=process_group)
612
+ else:
613
+ assert self.rank == dist.get_rank(group=process_group)
614
+ return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
615
+ elif p.placements == (Replicate(), Shard(dim=0)):
616
+ # Case for HSDP
617
+ process_group = p.device_mesh.get_group(mesh_dim=1)
618
+ if self.rank is None:
619
+ self.rank = dist.get_rank(group=process_group)
620
+ else:
621
+ assert self.rank == dist.get_rank(group=process_group)
622
+ for i, shard_mesh in enumerate(p.device_mesh.mesh):
623
+ if self.rank in shard_mesh:
624
+ return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
625
+ else:
626
+ raise ValueError(f"Unsupported placements ({p.placements}).")
627
+
628
+ def init_state_and_assign_params(self, names, params, group, qk_logits):
629
+ param_to_state = {}
630
+ param_to_flops = {}
631
+
632
+ total_flops = 0
633
+ for p in params:
634
+ g = p.grad
635
+ if g is None:
636
+ continue
637
+ assert g.ndim == 2, "Muon only supports 2D parameters."
638
+
639
+ flops = self._calc_flops(g, group["ns_steps"])
640
+ param_to_flops[id(p)] = flops
641
+ total_flops += flops
642
+
643
+ if self.debug:
644
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
645
+ flush=True)
646
+
647
+ paired = list(zip(names, params))
648
+
649
+ paired_sorted = sorted(paired,
650
+ key=lambda x: param_to_flops[id(x[1])],
651
+ reverse=True)
652
+
653
+ names_sorted, params_sorted = zip(*paired_sorted)
654
+ ordered_names = list(names_sorted)
655
+ ordered_params = list(params_sorted)
656
+
657
+ round_robin = 0
658
+ mesh = None
659
+ shard_mesh = None
660
+ process_group = None
661
+ for n, p in zip(ordered_names, ordered_params):
662
+ if mesh is None:
663
+ mesh = p.device_mesh
664
+ shard_mesh, process_group = self.get_shard_mesh(p)
665
+ elif mesh != p.device_mesh:
666
+ raise ValueError("All parameters must be on the same mesh.")
667
+ num_ranks = dist.get_world_size(group=process_group)
668
+ param_to_state[id(p)] = _muon_state()
669
+ param_to_state[id(
670
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
671
+ param_to_state[id(p)].process_group = process_group
672
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
673
+ param_to_state[id(p)].qk_clip_state = qk_clip_state
674
+ round_robin = (round_robin + 1) % len(shard_mesh)
675
+
676
+ return param_to_state, ordered_params
677
+
678
+ def base(self, names, params, group, lr, weight_decay, momentum,
679
+ qk_logits):
680
+ # generate weight updates in distributed fashion
681
+ for n, p in zip(names, params):
682
+ g = p.grad
683
+ if g is None:
684
+ continue
685
+ if g.ndim > 2:
686
+ g = g.view(g.size(0), -1)
687
+ assert g is not None
688
+
689
+ # calc update
690
+ state = self.state[p]
691
+ if "momentum_buffer" not in state:
692
+ state["momentum_buffer"] = torch.zeros_like(g)
693
+ buf = state["momentum_buffer"]
694
+ buf.mul_(momentum).add_(g)
695
+ if group["nesterov"]:
696
+ g = g.add(buf, alpha=momentum)
697
+ else:
698
+ g = buf
699
+
700
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
701
+ steps=group["ns_steps"])
702
+
703
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
704
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
705
+
706
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
707
+
708
+ scales_full = self._compute_scales(p, qk_clip_state)
709
+ if scales_full is not None:
710
+ Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
711
+
712
+ def _update_g(self, p, g, group, momentum):
713
+ # calc update
714
+ state = self.state[p]
715
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
716
+ torch.add(g, buf, alpha=momentum, out=buf)
717
+ if group["nesterov"]:
718
+ g.add_(buf, alpha=momentum)
719
+ return g
720
+ return buf
721
+
722
+ @staticmethod
723
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
724
+ # apply weight decay
725
+ p.data.mul_(1 - lr * weight_decay)
726
+ # apply update
727
+ p.data.add_(u, alpha=-adjusted_lr)
728
+
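Spelled out, the decoupled-weight-decay update performed here is:

    # p <- (1 - lr * weight_decay) * p - adjusted_lr * u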
729
+ def get_qk_clip_info(self, n, qk_logits):
730
+ head_dim = self.clip_config.get('head_dim')
731
+ threshold = self.clip_config.get('threshold')
732
+ kind, layer_idx = parse_qk_layer(n)
733
+
734
+ logit, indices = None, []
735
+ if qk_logits is not None and kind is not None:
736
+ logit = qk_logits[layer_idx]
737
+ indices_key = 'q_indices' if 'q' in kind else 'k_indices'
738
+ indices = self.clip_config.get(indices_key, []) or []
739
+
740
+ return QKClipInfo(
741
+ kind=kind,
742
+ indices=indices,
743
+ head_dim=head_dim,
744
+ threshold=threshold,
745
+ logit=logit,
746
+ )
747
+
748
+ @staticmethod
749
+ def _compute_scales(p, qk_clip_state):
750
+ kind = qk_clip_state.kind
751
+ indices = qk_clip_state.indices
752
+ head_dim = qk_clip_state.head_dim
753
+ threshold = qk_clip_state.threshold
754
+ logit = qk_clip_state.logit
755
+
756
+ H_global = p.shape[0] // head_dim
757
+ scales_full = torch.ones(H_global, device=p.data.device)
758
+ scaling = 0
759
+
760
+ for logit_idx, head_idx in enumerate(indices):
761
+ v_ele = float(logit[logit_idx])
762
+ if v_ele > threshold:
763
+ new_scale = math.sqrt(threshold / v_ele)
764
+ if new_scale < scales_full[head_idx]:
765
+ scales_full[head_idx] = new_scale
766
+ logger.info(
767
+ f"[{kind}] Head {head_idx} exceeded threshold "
768
+ f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
769
+ )
770
+ scaling += 1
771
+
772
+ return scales_full if scaling > 0 else None
773
+
774
+ @staticmethod
775
+ def _qk_clip(p, scales, head_dim):
776
+ W = p.data.view(-1, head_dim, p.data.shape[1])
777
+ W.mul_(scales.view(-1, 1, 1))
778
+
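A small illustration of the clip (hypothetical values): with head_dim = 128 and p of shape (H * 128, in_features), the view above groups rows per head; a head whose max QK logit is 400 against threshold = 100 receives scale sqrt(100 / 400) = 0.5 from _compute_scales, while all other heads keep scale 1.0.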
779
+ def parallel(self, names, params, group, lr, weight_decay, momentum,
780
+ qk_logits):
781
+ """
782
+ Perform a parallel optimization step using Muon.
783
+ """
784
+
785
+ for p in params:
786
+ g = p.grad
787
+ if g is None:
788
+ continue
789
+ if g.ndim > 2:
790
+ g = g.view(g.size(0), -1)
791
+
792
+ # Update g in the local rank
793
+ g = self._update_g(
794
+ p,
795
+ g,
796
+ group,
797
+ momentum=momentum,
798
+ )
799
+ p.grad = g
800
+
801
+ param_to_state, ordered_params = self.init_state_and_assign_params(
802
+ names, params, group, qk_logits)
803
+
804
+ assert self.rank is not None
805
+
806
+ def enqueue_all2all_gather(start_idx, chunk_size):
807
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
808
+ if target_params:
809
+ alloc_event = _alloc_gathered_grad(target_params,
810
+ param_to_state, self.rank,
811
+ self.compute_stream)
812
+ _all2all_gather(target_params, param_to_state, self.rank,
813
+ self.comm_stream, group["none_grad"],
814
+ alloc_event)
815
+
816
+ def enqueue_computes(start_idx, chunk_size):
817
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
818
+ state = param_to_state[id(p)]
819
+ _compute_u(p, state, group["ns_steps"], self.rank,
820
+ self.compute_stream)
821
+
822
+ def enqueue_all2all_scatter(start_idx, chunk_size):
823
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
824
+ if target_params:
825
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
826
+ self.rank,
827
+ self.compute_stream)
828
+ _all2all_scatter(target_params, param_to_state, self.rank,
829
+ self.comm_stream, alloc_event)
830
+
831
+ def enqueue_update_param(start_idx, chunk_size):
832
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
833
+ state = param_to_state[id(p)]
834
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
835
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
836
+ self.rank, self.compute_stream)
837
+
838
+ chunk_size = dist.get_world_size(param_to_state[id(
839
+ params[0])].process_group)
840
+
841
+ # Wait grad update
842
+ self.comm_stream.wait_stream(torch.cuda.current_stream())
843
+
844
+ overlap_step = self.overlap_step
845
+ for i in range(0, overlap_step):
846
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
847
+ enqueue_computes(i * chunk_size, chunk_size)
848
+
849
+ for i in range(0, len(params) + chunk_size - 1, chunk_size):
850
+ enqueue_all2all_scatter(i, chunk_size)
851
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
852
+ enqueue_update_param(i, chunk_size)
853
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
854
+
855
+ # Wait the last update_param to finish
856
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
857
+
858
+ @staticmethod
859
+ def _fused_adamw(
860
+ params: list[torch.Tensor],
861
+ grads: list[torch.Tensor],
862
+ exp_avgs: list[torch.Tensor],
863
+ exp_avg_sqs: list[torch.Tensor],
864
+ max_exp_avg_sqs: list[torch.Tensor],
865
+ state_steps: list[torch.Tensor],
866
+ amsgrad: bool,
867
+ beta1: float,
868
+ beta2: float,
869
+ lr: Union[float, torch.Tensor],
870
+ weight_decay: float,
871
+ eps: float,
872
+ maximize: bool,
873
+ ) -> None:
874
+ if not params:
875
+ return
876
+
877
+ # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
878
+ # treating it as a scalar.
879
+ lr_dict: Optional[DeviceDict] = ({
880
+ lr.device: lr
881
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
882
+ None)
883
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
884
+ [
885
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
886
+ state_steps
887
+ ] # type: ignore[list-item]
888
+ )
889
+ for (device, _), (
890
+ (
891
+ device_params_,
892
+ device_grads_,
893
+ device_exp_avgs_,
894
+ device_exp_avg_sqs_,
895
+ device_max_exp_avg_sqs,
896
+ device_state_steps_,
897
+ ),
898
+ _,
899
+ ) in grouped_tensors.items():
900
+ device_params = cast(list[torch.Tensor], device_params_)
901
+ device_grads = cast(list[torch.Tensor], device_grads_)
902
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
903
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
904
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
905
+
906
+ if lr_dict is not None and device not in lr_dict:
907
+ lr_dict[device] = lr.to(
908
+ device=device,
909
+ non_blocking=True) # type: ignore[union-attr]
910
+ lr = lr_dict[device]
911
+ torch._foreach_add_(device_state_steps, 1)
912
+ func = torch._fused_adamw_
913
+ func(
914
+ device_params,
915
+ device_grads,
916
+ device_exp_avgs,
917
+ device_exp_avg_sqs,
918
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
919
+ device_state_steps,
920
+ amsgrad=amsgrad,
921
+ lr=lr, # type: ignore[arg-type]
922
+ beta1=beta1,
923
+ beta2=beta2,
924
+ weight_decay=weight_decay,
925
+ eps=eps,
926
+ maximize=maximize,
927
+ )
928
+
929
+ def step(self, closure=None, qk_logits=None):
930
+ """Perform a single optimization step.
931
+
932
+ Args:
933
+ closure (Callable, optional): A closure that reevaluates the model
934
+ and returns the loss.
935
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
936
+ to 1D tensors of shape (num_heads,), representing the maximum
937
+ QK logits across all tokens, computed as
938
+ (1 / sqrt(head_dim)) * (Q @ K^T).
939
+ """
940
+ loss = None
941
+ if closure is not None:
942
+ with torch.enable_grad():
943
+ loss = closure()
944
+
945
+ for group in self.param_groups:
946
+ params = group["params"]
947
+
948
+ if group["use_muon"]:
949
+ ############################
950
+ # Muon #
951
+ ############################
952
+ lr = group["lr"]
953
+ weight_decay = group["weight_decay"]
954
+ momentum = group["momentum"]
955
+ names = group["names"]
956
+
957
+ param_dtensors = []
958
+ param_tensors = []
959
+ name_dtensors = []
960
+ name_tensors = []
961
+
962
+ for n, p in zip(names, params):
963
+ if p is None or p.grad is None:
964
+ continue
965
+ if isinstance(p.data, DTensor):
966
+ if all(
967
+ isinstance(placement, Replicate)
968
+ for placement in p.placements):
969
+ param_tensors.append(p)
970
+ name_tensors.append(n)
971
+ else:
972
+ param_dtensors.append(p)
973
+ name_dtensors.append(n)
974
+ elif isinstance(p.data, torch.Tensor):
975
+ param_tensors.append(p)
976
+ name_tensors.append(n)
977
+ else:
978
+ raise TypeError(
979
+ f"Unsupported parameter type: {type(p.data)}")
980
+
981
+ if self.debug:
982
+ print(
983
+ f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
984
+ flush=True,
985
+ )
986
+
987
+ if len(param_dtensors) > 0:
988
+ if not dist.is_initialized():
989
+ raise RuntimeError(
990
+ "Parallel Muon requires torch.distributed to be initialized."
991
+ )
992
+
993
+ self.parallel(
994
+ name_dtensors,
995
+ param_dtensors,
996
+ group,
997
+ lr=lr,
998
+ weight_decay=weight_decay,
999
+ momentum=momentum,
1000
+ qk_logits=qk_logits,
1001
+ )
1002
+
1003
+ if len(param_tensors) > 0:
1004
+ self.base(
1005
+ name_tensors,
1006
+ param_tensors,
1007
+ group,
1008
+ lr=lr,
1009
+ weight_decay=weight_decay,
1010
+ momentum=momentum,
1011
+ qk_logits=qk_logits,
1012
+ )
1013
+
1014
+ else:
1015
+ ############################
1016
+ # AdamW backup #
1017
+ ############################
1018
+
1019
+ params_with_grads = []
1020
+ grads = []
1021
+ moment1 = []
1022
+ moment2 = []
1023
+ max_exp_avg_sqs = []
1024
+ state_steps = []
1025
+ lr = group["lr"]
1026
+ beta1, beta2 = group["adamw_betas"]
1027
+ eps = group["adamw_eps"]
1028
+ weight_decay = group["weight_decay"]
1029
+
1030
+ for p in params:
1031
+ g = p.grad
1032
+ if g is None:
1033
+ continue
1034
+ state = self.state[p]
1035
+ params_with_grads.append(p)
1036
+ grads.append(g)
1037
+ if "step" not in state:
1038
+ state["step"] = (torch.zeros((),
1039
+ dtype=torch.float32,
1040
+ device=p.device))
1041
+ state["moment1"] = torch.zeros_like(g)
1042
+ state["moment2"] = torch.zeros_like(g)
1043
+ moment1.append(state["moment1"])
1044
+ moment2.append(state["moment2"])
1045
+ if not isinstance(state["step"], torch.Tensor):
1046
+ step_tensor = torch.tensor(state["step"],
1047
+ dtype=torch.float32,
1048
+ device=p.device)
1049
+ else:
1050
+ step_tensor = state["step"]
1051
+ state_steps.append(step_tensor)
1052
+
1053
+ self._fused_adamw(
1054
+ params_with_grads,
1055
+ grads,
1056
+ moment1,
1057
+ moment2,
1058
+ max_exp_avg_sqs,
1059
+ state_steps,
1060
+ amsgrad=False,
1061
+ beta1=beta1,
1062
+ beta2=beta2,
1063
+ lr=lr,
1064
+ weight_decay=weight_decay,
1065
+ eps=eps,
1066
+ maximize=False,
1067
+ )
1068
+
1069
+ return loss
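Taken together, a minimal end-to-end sketch of driving this optimizer (illustrative, mirroring the snippet embedded in the constructor's error message; `model`, `loader`, `compute_loss` and the hyperparameters are placeholders):

    from kernels import get_kernel

    optimizer = get_kernel('motif-technologies/optimizer')
    params = optimizer.muon.get_default_muon_param_groups(model)
    optim = optimizer.Muon(params, lr=2e-2, momentum=0.95)

    for batch in loader:
        loss = compute_loss(model, batch)
        loss.backward()
        optim.step(qk_logits=None)  # pass per-layer max QK logits here to enable QK clipping
        optim.zero_grad()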
build/torch29-cxx11-rocm63-x86_64-linux/optimizer/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .muon import Muon
2
+
3
+ __all__ = [
4
+ "Muon",
5
+ ]
build/torch29-cxx11-rocm63-x86_64-linux/optimizer/_ops.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from . import _optimizer_811726c_dirty
3
+ ops = torch.ops._optimizer_811726c_dirty
4
+
5
+ def add_op_namespace_prefix(op_name: str):
6
+ """
7
+ Prefix op by namespace.
8
+ """
9
+ return f"_optimizer_811726c_dirty::{op_name}"
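A hedged usage sketch of this helper (illustrative; `some_op` is a placeholder, not an op this extension necessarily registers):

    from optimizer._ops import ops, add_op_namespace_prefix

    add_op_namespace_prefix("some_op")  # -> "_optimizer_811726c_dirty::some_op"
    # `ops` is the torch.ops namespace bound to the loaded extension, so any
    # registered kernel would be reachable as ops.<op_name>(...)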
build/torch29-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0661740cd0f97ca56ef83979c5a5fa059bcba411148f89d836e9305065578e73
3
+ size 1749264
build/torch29-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
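A minimal numerical check for the kernel above (illustrative; assumes a CUDA device with Triton available, and expects only small discrepancies from bfloat16 rounding):

    x = torch.randn(512, 2048, device="cuda", dtype=torch.bfloat16)
    y = matmul_transpose(x)  # (512, 512) Gram matrix, y = x @ x.T
    ref = (x.float() @ x.float().T).to(torch.bfloat16)
    print((y - ref).abs().max())  # expected to be small bfloat16 rounding noise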
build/torch29-cxx11-rocm63-x86_64-linux/optimizer/muon.py ADDED
@@ -0,0 +1,1069 @@
1
+ import logging
2
+ import math
3
+ import types
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional, Union, cast
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.distributed._tensor import DTensor, Replicate, Shard
10
+
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
+
18
+ # This code snippet is a modified version adapted from the following GitHub repositories:
19
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
+ # Muon's Newton–Schulz iteration causes high variance in singular values
21
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
+ @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
+ def _zeropower_via_newtonschulz5(G, steps):
25
+ """
26
+ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
27
+ quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
28
+ of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
29
+ zero even beyond the point where the iteration no longer converges all the way to one everywhere
30
+ on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
31
+ where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
32
+ performance at all relative to UV^T, where USV^T = G is the SVD.
33
+ """
34
+ assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
+ X = G # no manual typecast
37
+
38
+ if G.size(0) > G.size(1):
39
+ X = X.T
40
+ # Ensure spectral norm is at most 1
41
+ X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
+ # Perform the NS iterations
45
+ for a, b, c in [
46
+ (4.0848, -6.8946, 2.9270),
47
+ (3.9505, -6.3029, 2.6377),
48
+ (3.7418, -5.5913, 2.3037),
49
+ (2.8769, -3.1427, 1.2046),
50
+ (2.8366, -3.0525, 1.2012),
51
+ ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
56
+
57
+ if G.size(0) > G.size(1):
58
+ X = X.T
59
+ return X
60
+
61
+
62
+ @dataclass
63
+ class _muon_state:
64
+ # TODO: use Optional
65
+ worker_rank: int | None = None
66
+ gathered_grad: torch.Tensor | None = None
67
+ scattered_u: DTensor | None = None
68
+ computed_u: torch.Tensor | None = None
69
+ gather_event: torch.cuda.Event | None = None
70
+ compute_event: torch.cuda.Event | None = None
71
+ scatter_event: torch.cuda.Event | None = None
72
+ process_group = None
73
+ qk_clip_state = None
74
+
75
+
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
+ @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
+ """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
+ """
112
+ with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
+
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert any(
132
+ len(v) > 0 for v in per_dst
133
+ ), "At least one destination rank must receive a sharded tensor"
134
+ # list[list[Tensor]] -> list[Tensor]
135
+ per_dst = [t for dst in per_dst for t in dst]
136
+
137
+ send_buf = torch.cat(per_dst, dim=0)
138
+
139
+ owned_params = [
140
+ p for p in params if param_to_state[id(p)].worker_rank == rank
141
+ ]
142
+
143
+ # Compute receive sizes and allocate receiving buffers
144
+ recv_counts = [0] * num_ranks
145
+
146
+ for src in range(num_ranks):
147
+ total = 0
148
+ for p in owned_params:
149
+ state = param_to_state[id(p)]
150
+ assert state.worker_rank == rank
151
+ total += split_elems_for_src(p, src, num_ranks)
152
+ recv_counts[src] = total
153
+
154
+ recv_total = sum(recv_counts)
155
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
156
+
157
+ #All2All
158
+ dist.all_to_all_single(
159
+ recv_buf,
160
+ send_buf,
161
+ output_split_sizes=recv_counts,
162
+ input_split_sizes=send_counts,
163
+ group=process_group,
164
+ )
165
+
166
+ # Reconstructs gathered grad from the received buffer
167
+ #
168
+ # recv_buf (num ranks = 3)
169
+ #
170
+ # From rank 0 From rank 1 From rank 2
171
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
172
+ #
173
+ # Outer loop:
174
+ # rank 0 -> rank 1 -> rank 2
175
+ #
176
+ # Inner loop:
177
+ # p1_n -> p2_n -> p3_n
178
+
179
+ comm_stream.wait_event(alloc_event)
180
+
181
+ off = 0
182
+ write_offsets = {id(p): 0 for p in owned_params}
183
+ for src in range(num_ranks):
184
+ if recv_counts[src] == 0:
185
+ continue
186
+
187
+ block = recv_counts[src]
188
+ inner_off = 0
189
+ for p in owned_params:
190
+ state = param_to_state[id(p)]
191
+ assert state.worker_rank == rank
192
+ n = split_elems_for_src(p, src, num_ranks)
193
+ assert n > 0
194
+
195
+ sg = recv_buf.narrow(0, off + inner_off, n)
196
+ woff = write_offsets[id(p)]
197
+ dst = state.gathered_grad.narrow(0, woff, n)
198
+ dst.copy_(sg)
199
+
200
+ write_offsets[id(p)] += n
201
+ inner_off += n
202
+ off += block
203
+
204
+ for p in params:
205
+ state = param_to_state[id(p)]
206
+ if state.worker_rank == rank:
207
+ state.gathered_grad = state.gathered_grad.view_as(p)
208
+ state.gather_event = torch.cuda.Event()
209
+ state.gather_event.record(comm_stream)
210
+ else:
211
+ state.gathered_grad = None
212
+ state.gather_event = None
213
+ if none_grad:
214
+ p.grad = None
215
+
216
+
217
+ @torch.no_grad()
218
+ def _compute_u(p, state, steps, rank, compute_stream):
219
+ """
220
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
221
+ """
222
+ with torch.cuda.stream(compute_stream):
223
+ if rank == state.worker_rank:
224
+ if state.gather_event is None:
225
+ raise RuntimeError("Gather event must be set before compute.")
226
+ compute_stream.wait_event(state.gather_event)
227
+ u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
228
+ state.gathered_grad = None
229
+ state.computed_u = u
230
+ state.compute_event = torch.cuda.Event()
231
+ state.compute_event.record()
232
+ else:
233
+ state.computed_u = None
234
+ state.compute_event = None
235
+
236
+
237
+ @torch.no_grad()
238
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
239
+ """
240
+ Pre-allocate scattered_u buffer on compute_stream
241
+ before launching all2all scatter
242
+ """
243
+ with torch.cuda.stream(compute_stream):
244
+ for p in params:
245
+ state = param_to_state[id(p)]
246
+ state.scattered_u = torch.empty_like(p.to_local(),
247
+ dtype=COMM_DTYPE)
248
+
249
+ alloc_event = torch.cuda.Event()
250
+ alloc_event.record(compute_stream)
251
+ return alloc_event
252
+
253
+
254
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
255
+ """
256
+ All2all scatters the computed orthogonalized updates back to all ranks as shards
257
+ """
258
+ with torch.cuda.stream(comm_stream):
259
+ process_group = param_to_state[id(params[0])].process_group
260
+ num_ranks = dist.get_world_size(group=process_group)
261
+ owned_params = [
262
+ p for p in params if param_to_state[id(p)].worker_rank == rank
263
+ ]
264
+
265
+ # Construct sending buffer
266
+ per_dst = [[] for _ in range(num_ranks)]
267
+ send_counts = [0] * num_ranks
268
+
269
+ if owned_params:
270
+ for p in owned_params:
271
+ state = param_to_state[id(p)]
272
+ if state.compute_event is None:
273
+ raise RuntimeError(
274
+ "Compute event must be set before scatter.")
275
+ comm_stream.wait_event(state.compute_event)
276
+ state.gathered_grad = None
277
+
278
+ assert state.computed_u is not None
279
+
280
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
281
+
282
+ offset = 0
283
+ for dst in range(num_ranks):
284
+ n = split_elems_for_src(p, dst, num_ranks)
285
+ assert n > 0
286
+
287
+ su = u_full.narrow(0, offset, n)
288
+ per_dst[dst].append(su)
289
+ send_counts[dst] += n
290
+ offset += n
291
+
292
+ assert offset == u_full.numel()
293
+
294
+ lengths = [len(v) for v in per_dst]
295
+ if all(l > 0 for l in lengths):
296
+ assert all(
297
+ l == lengths[0] for l in lengths
298
+ ), "All destination ranks must have the same number of sharded tensor"
299
+ # list[list[Tensor]] -> list[Tensor]
300
+ per_dst = [t for dst in per_dst for t in dst]
301
+ send_buf = torch.cat(per_dst, dim=0)
302
+ else:
303
+ # all_to_all requires participation from all ranks
304
+ # Even non-owner ranks must join the collective call
305
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
306
+
307
+ # Compute receive sizes and allocate receiving buffers
308
+ recv_counts = [0] * num_ranks
309
+
310
+ for src in range(num_ranks):
311
+ total = 0
312
+ for p in params:
313
+ state = param_to_state[id(p)]
314
+ if state.worker_rank != src:
315
+ continue
316
+ total += split_elems_for_src(p, rank, num_ranks)
317
+ recv_counts[src] = total
318
+
319
+ recv_total = sum(recv_counts)
320
+ assert recv_total > 0
321
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
322
+
323
+ #All2All
324
+ dist.all_to_all_single(
325
+ recv_buf,
326
+ send_buf,
327
+ output_split_sizes=recv_counts,
328
+ input_split_sizes=send_counts,
329
+ group=process_group,
330
+ )
331
+
332
+ # Copy to pre-allocated scattered_u buffer from the received buffer
333
+ #
334
+ # recv_buf (num ranks = 3, local_rank = 0)
335
+ #
336
+ # From rank 0 From rank 1 From rank 2
337
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
338
+ #
339
+ # Outer loop:
340
+ # rank 0 -> rank 1 -> rank 2
341
+ #
342
+ # Inner loop:
343
+ # src(0) : p1_0 -> p2_0 -> p3_0
344
+ # src(1) : p4_0
345
+ # src(2) : p5_0 -> p6_0
346
+
347
+ comm_stream.wait_event(alloc_event)
348
+
349
+ off = 0
350
+ for src in range(num_ranks):
351
+ block = recv_counts[src]
352
+ if block == 0:
353
+ continue
354
+
355
+ inner_off = 0
356
+ for p in params:
357
+ state = param_to_state[id(p)]
358
+ if state.worker_rank != src:
359
+ continue
360
+ n = split_elems_for_src(p, rank, num_ranks)
361
+ assert n > 0
362
+
363
+ flat_local = recv_buf.narrow(0, off + inner_off,
364
+ n).view_as(p.to_local())
365
+ state.scattered_u.copy_(flat_local)
366
+
367
+ state.scatter_event = torch.cuda.Event()
368
+ state.scatter_event.record(comm_stream)
369
+ inner_off += n
370
+
371
+ assert inner_off == block
372
+ off += block
373
+
374
+
375
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
376
+ compute_stream):
377
+ """
378
+ Update sharded parameter p with the scattered_u.
379
+ Only worker_rank frees computed_u.
380
+ """
381
+ with torch.cuda.stream(compute_stream):
382
+ if state.scatter_event is None:
383
+ raise RuntimeError("Scatter event must be set before update")
384
+ compute_stream.wait_event(state.scatter_event)
385
+ u_dtensor = DTensor.from_local(
386
+ state.scattered_u,
387
+ placements=p.placements,
388
+ device_mesh=p.device_mesh,
389
+ )
390
+
391
+ state.scattered_u = u_dtensor
392
+
393
+ if rank == state.worker_rank:
394
+ # Free computed_u
395
+ state.computed_u = None
396
+
397
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
398
+ state.scattered_u = None
399
+ u_dtensor = None
400
+
401
+ scales_full = Muon._compute_scales(p, state.qk_clip_state)
402
+ if scales_full is not None:
403
+ num_ranks = dist.get_world_size(group=state.process_group)
404
+ local_rank = dist.get_rank(group=state.process_group)
405
+ scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
406
+ scales_local = DTensor.from_local(
407
+ scales_local,
408
+ placements=p.placements,
409
+ device_mesh=p.device_mesh,
410
+ )
411
+ Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
412
+
413
+
414
+ def default_is_muon(name, x):
415
+ skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
416
+ return x.ndim >= 2 and not any(key in name for key in skip_keys)
417
+
418
+
419
+ def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
420
+ muon_params, muon_names = [], []
421
+ non_muon_params = []
422
+
423
+ for n, p in model.named_parameters():
424
+ if not p.requires_grad:
425
+ continue
426
+ if is_muon_func(n, p):
427
+ muon_params.append(p)
428
+ muon_names.append(n)
429
+ else:
430
+ non_muon_params.append(p)
431
+
432
+ return [
433
+ {
434
+ "params": muon_params,
435
+ "names": muon_names,
436
+ "use_muon": True,
437
+ },
438
+ {
439
+ "params": non_muon_params,
440
+ "use_muon": False,
441
+ },
442
+ ]
443
+
444
+
445
+ def parse_qk_layer(name: str) -> tuple[str | None, int]:
446
+ """
447
+ Parse a parameter name to check if it is a query/key projection layer
448
+ ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
449
+
450
+ Returns:
451
+ (kind, layer_idx) or (None, -1) if not matched.
452
+
453
+ Example:
454
+ 'model.3.attn.wq.weight' -> ('wq', 3)
455
+ 'model.5.attn.wk.weight' -> ('wk', 5)
456
+ 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
457
+ 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
458
+ 'model.4.attn.v_proj.weight' -> (None, -1)
459
+ """
460
+ parts = name.split('.')
461
+ if len(parts) < 3:
462
+ return None, -1
463
+
464
+ kind = parts[-2]
465
+
466
+ layer_idx = -1
467
+ for part in reversed(parts):
468
+ if part.isdigit():
469
+ layer_idx = int(part)
470
+ break
471
+
472
+ if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
473
+ return kind, layer_idx
474
+
475
+ return None, -1
476
+
477
+
478
+ @dataclass
479
+ class QKClipInfo:
480
+ """Per-parameter dynamic info computed from config + runtime logits."""
481
+ kind: Optional[str] # 'wq'/'q_proj' or 'wk'/'k_proj' or None
482
+ indices: List[int] # which heads to consider for clipping
483
+ head_dim: int # from config
484
+ threshold: float # from config
485
+ logit: Optional[torch.Tensor]
486
+
487
+
488
+ class Muon(torch.optim.Optimizer):
489
+ """
490
+ Muon - MomentUm Orthogonalized by Newton-schulz
491
+
492
+ Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
493
+ processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
494
+ matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
495
+ the advantage that it can be stably run in bfloat16 on the GPU.
496
+
497
+ Some warnings:
498
+ - We believe this optimizer is unlikely to work well for training with small batch size.
499
+ - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
500
+
501
+ Arguments:
502
+ params: Parameter groups to optimize. Build them with get_default_muon_param_groups(model); every group must define the key 'use_muon'.
503
+ is_muon_func: A function that takes a parameter name and the parameter, and returns whether that parameter should be optimized by Muon (used by get_default_muon_param_groups, not passed to this constructor).
504
+ lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
505
+ momentum: The momentum used by the internal SGD. (0.95 is a good default)
506
+ nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
507
+ ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
508
+ weight_decay: The weight decay applied by both Muon and the internal AdamW.
509
+ Parameters that are {0, 1}-D, or detected as embedding / lm_head weights, are optimized by AdamW instead of Muon.
510
+ adamw_lr: The learning rate for the internal AdamW.
511
+ adamw_betas: The betas for the internal AdamW.
512
+ adamw_eps: The epsilon for the internal AdamW.
513
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
514
+ debug: Whether to print debug information.
515
+ clip_config : Configuration for QK clipping. Expected keys:
516
+ - "q_indices" (list[int]): Indices of query heads to consider.
517
+ - "k_indices" (list[int]): Indices of key heads to consider.
518
+ - "head_dim" (int): Dimensionality of each attention head.
519
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
520
+ this value will be scaled down.
521
+ Default is:
522
+ {
523
+ "q_indices": [],
524
+ "k_indices": [],
525
+ "head_dim": 128,
526
+ "threshold": 100
527
+ }
528
+ overlap_step : How many all2all gather and compute operations are launched in advance
529
+ before the corresponding all2all scatter steps begin.
530
+ A higher overlap_step increases memory usage but can improve
531
+ performance by overlapping communication.
532
+ Parallel muon only.
533
+ """
534
+
535
+ def __init__(self,
536
+ params,
537
+ lr=1e-3,
538
+ momentum=0.95,
539
+ nesterov=True,
540
+ ns_steps=5,
541
+ weight_decay=0.1,
542
+ adamw_betas=(0.9, 0.95),
543
+ adamw_eps=1e-8,
544
+ none_grad=True,
545
+ debug=False,
546
+ clip_config={
547
+ "q_indices": [],
548
+ "k_indices": [],
549
+ "head_dim": 128,
550
+ "threshold": 100
551
+ },
552
+ overlap_step=5):
553
+ defaults = dict(
554
+ lr=lr,
555
+ weight_decay=weight_decay,
556
+ momentum=momentum,
557
+ nesterov=nesterov,
558
+ ns_steps=ns_steps,
559
+ adamw_betas=adamw_betas,
560
+ adamw_eps=adamw_eps,
561
+ none_grad=none_grad,
562
+ use_muon=True,
563
+ )
564
+ error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
565
+ instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
566
+
567
+ if isinstance(params, types.GeneratorType):
568
+ raise ValueError(error_message.format(idx=0) + instruction_code)
569
+ for _idx, param_group in enumerate(params):
570
+ if param_group.get("use_muon", None) is None:
571
+ raise ValueError(
572
+ error_message.format(idx=_idx) + instruction_code)
573
+
574
+ super().__init__(params, defaults)
575
+
576
+ self.rank = None
577
+
578
+ self.comm_stream = torch.cuda.Stream()
579
+ self.compute_stream = torch.cuda.Stream()
580
+ self.debug = debug
581
+ self.clip_config = clip_config
582
+ self.overlap_step = overlap_step
583
+
584
+ def _calc_flops(self, G, steps):
585
+ assert len(G.shape) == 2
586
+ M, N = G.shape
587
+ if M > N:
588
+ M, N = N, M
589
+
590
+ return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
591
+
592
+ def adjust_lr_for_muon(self, lr, param_shape):
593
+ A, B = param_shape[:2]
594
+ # We adjust the learning rate and weight decay based on the size of the parameter matrix
595
+ # as described in the paper
596
+ adjusted_ratio = 0.2 * math.sqrt(max(A, B))
597
+ adjusted_lr = lr * adjusted_ratio
598
+ return adjusted_lr
599
+
600
+ def get_shard_mesh(self, p):
601
+ """
602
+ Get the shard mesh for a parameter p on the given rank.
603
+ """
604
+ assert isinstance(
605
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
606
+
607
+ if p.placements == (Shard(dim=0), ):
608
+ # Case for FSDP
609
+ process_group = p.device_mesh.get_group(mesh_dim=0)
610
+ if self.rank is None:
611
+ self.rank = dist.get_rank(group=process_group)
612
+ else:
613
+ assert self.rank == dist.get_rank(group=process_group)
614
+ return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
615
+ elif p.placements == (Replicate(), Shard(dim=0)):
616
+ # Case for HSDP
617
+ process_group = p.device_mesh.get_group(mesh_dim=1)
618
+ if self.rank is None:
619
+ self.rank = dist.get_rank(group=process_group)
620
+ else:
621
+ assert self.rank == dist.get_rank(group=process_group)
622
+ for i, shard_mesh in enumerate(p.device_mesh.mesh):
623
+ if self.rank in shard_mesh:
624
+ return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
625
+ else:
626
+ raise ValueError(f"Unsupported placements ({p.placements}).")
627
+
628
+ def init_state_and_assign_params(self, names, params, group, qk_logits):
629
+ param_to_state = {}
630
+ param_to_flops = {}
631
+
632
+ total_flops = 0
633
+ for p in params:
634
+ g = p.grad
635
+ if g is None:
636
+ continue
637
+ assert g.ndim == 2, "Muon only supports 2D parameters."
638
+
639
+ flops = self._calc_flops(g, group["ns_steps"])
640
+ param_to_flops[id(p)] = flops
641
+ total_flops += flops
642
+
643
+ if self.debug:
644
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
645
+ flush=True)
646
+
647
+ paired = list(zip(names, params))
648
+
649
+ paired_sorted = sorted(paired,
650
+ key=lambda x: param_to_flops[id(x[1])],
651
+ reverse=True)
652
+
653
+ names_sorted, params_sorted = zip(*paired_sorted)
654
+ ordered_names = list(names_sorted)
655
+ ordered_params = list(params_sorted)
656
+
657
+ round_robin = 0
658
+ mesh = None
659
+ shard_mesh = None
660
+ process_group = None
661
+ for n, p in zip(ordered_names, ordered_params):
662
+ if mesh is None:
663
+ mesh = p.device_mesh
664
+ shard_mesh, process_group = self.get_shard_mesh(p)
665
+ elif mesh != p.device_mesh:
666
+ raise ValueError("All parameters must be on the same mesh.")
667
+ num_ranks = dist.get_world_size(group=process_group)
668
+ param_to_state[id(p)] = _muon_state()
669
+ param_to_state[id(
670
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
671
+ param_to_state[id(p)].process_group = process_group
672
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
673
+ param_to_state[id(p)].qk_clip_state = qk_clip_state
674
+ round_robin = (round_robin + 1) % len(shard_mesh)
675
+
676
+ return param_to_state, ordered_params
677
+
678
+ def base(self, names, params, group, lr, weight_decay, momentum,
679
+ qk_logits):
680
+ # generate weight updates in distributed fashion
681
+ for n, p in zip(names, params):
682
+ g = p.grad
683
+ if g is None:
684
+ continue
685
+ if g.ndim > 2:
686
+ g = g.view(g.size(0), -1)
687
+ assert g is not None
688
+
689
+ # calc update
690
+ state = self.state[p]
691
+ if "momentum_buffer" not in state:
692
+ state["momentum_buffer"] = torch.zeros_like(g)
693
+ buf = state["momentum_buffer"]
694
+ buf.mul_(momentum).add_(g)
695
+ if group["nesterov"]:
696
+ g = g.add(buf, alpha=momentum)
697
+ else:
698
+ g = buf
699
+
700
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
701
+ steps=group["ns_steps"])
702
+
703
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
704
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
705
+
706
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
707
+
708
+ scales_full = self._compute_scales(p, qk_clip_state)
709
+ if scales_full is not None:
710
+ Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
711
+
712
+ def _update_g(self, p, g, group, momentum):
713
+ # calc update
714
+ state = self.state[p]
715
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
716
+ torch.add(g, buf, alpha=momentum, out=buf)
717
+ if group["nesterov"]:
718
+ g.add_(buf, alpha=momentum)
719
+ return g
720
+ return buf
721
+
722
+ @staticmethod
723
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
724
+ # apply weight decay
725
+ p.data.mul_(1 - lr * weight_decay)
726
+ # apply update
727
+ p.data.add_(u, alpha=-adjusted_lr)
728
+
729
+ def get_qk_clip_info(self, n, qk_logits):
730
+ head_dim = self.clip_config.get('head_dim')
731
+ threshold = self.clip_config.get('threshold')
732
+ kind, layer_idx = parse_qk_layer(n)
733
+
734
+ logit, indices = None, []
735
+ if qk_logits is not None and kind is not None:
736
+ logit = qk_logits[layer_idx]
737
+ indices_key = 'q_indices' if 'q' in kind else 'k_indices'
738
+ indices = self.clip_config.get(indices_key, []) or []
739
+
740
+ return QKClipInfo(
741
+ kind=kind,
742
+ indices=indices,
743
+ head_dim=head_dim,
744
+ threshold=threshold,
745
+ logit=logit,
746
+ )
747
+
748
+ @staticmethod
749
+ def _compute_scales(p, qk_clip_state):
750
+ kind = qk_clip_state.kind
751
+ indices = qk_clip_state.indices
752
+ head_dim = qk_clip_state.head_dim
753
+ threshold = qk_clip_state.threshold
754
+ logit = qk_clip_state.logit
755
+
756
+ H_global = p.shape[0] // head_dim
757
+ scales_full = torch.ones(H_global, device=p.data.device)
758
+ scaling = 0
759
+
760
+ for logit_idx, head_idx in enumerate(indices):
761
+ v_ele = float(logit[logit_idx])
762
+ if v_ele > threshold:
763
+ new_scale = math.sqrt(threshold / v_ele)
764
+ if new_scale < scales_full[head_idx]:
765
+ scales_full[head_idx] = new_scale
766
+ logger.info(
767
+ f"[{kind}] Head {head_idx} exceeded threshold "
768
+ f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
769
+ )
770
+ scaling += 1
771
+
772
+ return scales_full if scaling > 0 else None
773
+
774
+ @staticmethod
775
+ def _qk_clip(p, scales, head_dim):
776
+ W = p.data.view(-1, head_dim, p.data.shape[1])
777
+ W.mul_(scales.view(-1, 1, 1))
778
+
779
+ def parallel(self, names, params, group, lr, weight_decay, momentum,
780
+ qk_logits):
781
+ """
782
+ Perform a parallel optimization step using Muon.
783
+ """
784
+
785
+ for p in params:
786
+ g = p.grad
787
+ if g is None:
788
+ continue
789
+ if g.ndim > 2:
790
+ g = g.view(g.size(0), -1)
791
+
792
+ # Update g in the local rank
793
+ g = self._update_g(
794
+ p,
795
+ g,
796
+ group,
797
+ momentum=momentum,
798
+ )
799
+ p.grad = g
800
+
801
+ param_to_state, ordered_params = self.init_state_and_assign_params(
802
+ names, params, group, qk_logits)
803
+
804
+ assert self.rank is not None
805
+
806
+ def enqueue_all2all_gather(start_idx, chunk_size):
807
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
808
+ if target_params:
809
+ alloc_event = _alloc_gathered_grad(target_params,
810
+ param_to_state, self.rank,
811
+ self.compute_stream)
812
+ _all2all_gather(target_params, param_to_state, self.rank,
813
+ self.comm_stream, group["none_grad"],
814
+ alloc_event)
815
+
816
+ def enqueue_computes(start_idx, chunk_size):
817
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
818
+ state = param_to_state[id(p)]
819
+ _compute_u(p, state, group["ns_steps"], self.rank,
820
+ self.compute_stream)
821
+
822
+ def enqueue_all2all_scatter(start_idx, chunk_size):
823
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
824
+ if target_params:
825
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
826
+ self.rank,
827
+ self.compute_stream)
828
+ _all2all_scatter(target_params, param_to_state, self.rank,
829
+ self.comm_stream, alloc_event)
830
+
831
+ def enqueue_update_param(start_idx, chunk_size):
832
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
833
+ state = param_to_state[id(p)]
834
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
835
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
836
+ self.rank, self.compute_stream)
837
+
838
+ chunk_size = dist.get_world_size(param_to_state[id(
839
+ params[0])].process_group)
840
+
841
+ # Wait grad update
842
+ self.comm_stream.wait_stream(torch.cuda.current_stream())
843
+
844
+ overlap_step = self.overlap_step
845
+ for i in range(0, overlap_step):
846
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
847
+ enqueue_computes(i * chunk_size, chunk_size)
848
+
849
+ for i in range(0, len(params) + chunk_size - 1, chunk_size):
850
+ enqueue_all2all_scatter(i, chunk_size)
851
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
852
+ enqueue_update_param(i, chunk_size)
853
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
854
+
855
+ # Wait the last update_param to finish
856
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
857
+
858
+ @staticmethod
859
+ def _fused_adamw(
860
+ params: list[torch.Tensor],
861
+ grads: list[torch.Tensor],
862
+ exp_avgs: list[torch.Tensor],
863
+ exp_avg_sqs: list[torch.Tensor],
864
+ max_exp_avg_sqs: list[torch.Tensor],
865
+ state_steps: list[torch.Tensor],
866
+ amsgrad: bool,
867
+ beta1: float,
868
+ beta2: float,
869
+ lr: Union[float, torch.Tensor],
870
+ weight_decay: float,
871
+ eps: float,
872
+ maximize: bool,
873
+ ) -> None:
874
+ if not params:
875
+ return
876
+
877
+ # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
878
+ # treating it as a scalar.
879
+ lr_dict: Optional[DeviceDict] = ({
880
+ lr.device: lr
881
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
882
+ None)
883
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
884
+ [
885
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
886
+ state_steps
887
+ ] # type: ignore[list-item]
888
+ )
889
+ for (device, _), (
890
+ (
891
+ device_params_,
892
+ device_grads_,
893
+ device_exp_avgs_,
894
+ device_exp_avg_sqs_,
895
+ device_max_exp_avg_sqs,
896
+ device_state_steps_,
897
+ ),
898
+ _,
899
+ ) in grouped_tensors.items():
900
+ device_params = cast(list[torch.Tensor], device_params_)
901
+ device_grads = cast(list[torch.Tensor], device_grads_)
902
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
903
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
904
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
905
+
906
+ if lr_dict is not None and device not in lr_dict:
907
+ lr_dict[device] = lr.to(
908
+ device=device,
909
+ non_blocking=True) # type: ignore[union-attr]
910
+ lr = lr_dict[device]
911
+ torch._foreach_add_(device_state_steps, 1)
912
+ func = torch._fused_adamw_
913
+ func(
914
+ device_params,
915
+ device_grads,
916
+ device_exp_avgs,
917
+ device_exp_avg_sqs,
918
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
919
+ device_state_steps,
920
+ amsgrad=amsgrad,
921
+ lr=lr, # type: ignore[arg-type]
922
+ beta1=beta1,
923
+ beta2=beta2,
924
+ weight_decay=weight_decay,
925
+ eps=eps,
926
+ maximize=maximize,
927
+ )
928
+
929
+ def step(self, closure=None, qk_logits=None):
930
+ """Perform a single optimization step.
931
+
932
+ Args:
933
+ closure (Callable, optional): A closure that reevaluates the model
934
+ and returns the loss.
935
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
936
+ to 1D tensors of shape (num_heads,), representing the maximum
937
+ QK logits across all tokens, computed as
938
+ (1 / sqrt(head_dim)) * (Q @ K^T).
939
+ """
940
+ loss = None
941
+ if closure is not None:
942
+ with torch.enable_grad():
943
+ loss = closure()
944
+
945
+ for group in self.param_groups:
946
+ params = group["params"]
947
+
948
+ if group["use_muon"]:
949
+ ############################
950
+ # Muon #
951
+ ############################
952
+ lr = group["lr"]
953
+ weight_decay = group["weight_decay"]
954
+ momentum = group["momentum"]
955
+ names = group["names"]
956
+
957
+ param_dtensors = []
958
+ param_tensors = []
959
+ name_dtensors = []
960
+ name_tensors = []
961
+
962
+ for n, p in zip(names, params):
963
+ if p is None or p.grad is None:
964
+ continue
965
+ if isinstance(p.data, DTensor):
966
+ if all(
967
+ isinstance(placement, Replicate)
968
+ for placement in p.placements):
969
+ param_tensors.append(p)
970
+ name_tensors.append(n)
971
+ else:
972
+ param_dtensors.append(p)
973
+ name_dtensors.append(n)
974
+ elif isinstance(p.data, torch.Tensor):
975
+ param_tensors.append(p)
976
+ name_tensors.append(n)
977
+ else:
978
+ raise TypeError(
979
+ f"Unsupported parameter type: {type(p.data)}")
980
+
981
+ if self.debug:
982
+ print(
983
+ f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
984
+ flush=True,
985
+ )
986
+
987
+ if len(param_dtensors) > 0:
988
+ if not dist.is_initialized():
989
+ raise RuntimeError(
990
+ "Parallel Muon requires torch.distributed to be initialized."
991
+ )
992
+
993
+ self.parallel(
994
+ name_dtensors,
995
+ param_dtensors,
996
+ group,
997
+ lr=lr,
998
+ weight_decay=weight_decay,
999
+ momentum=momentum,
1000
+ qk_logits=qk_logits,
1001
+ )
1002
+
1003
+ if len(param_tensors) > 0:
1004
+ self.base(
1005
+ name_tensors,
1006
+ param_tensors,
1007
+ group,
1008
+ lr=lr,
1009
+ weight_decay=weight_decay,
1010
+ momentum=momentum,
1011
+ qk_logits=qk_logits,
1012
+ )
1013
+
1014
+ else:
1015
+ ############################
1016
+ # AdamW backup #
1017
+ ############################
1018
+
1019
+ params_with_grads = []
1020
+ grads = []
1021
+ moment1 = []
1022
+ moment2 = []
1023
+ max_exp_avg_sqs = []
1024
+ state_steps = []
1025
+ lr = group["lr"]
1026
+ beta1, beta2 = group["adamw_betas"]
1027
+ eps = group["adamw_eps"]
1028
+ weight_decay = group["weight_decay"]
1029
+
1030
+ for p in params:
1031
+ g = p.grad
1032
+ if g is None:
1033
+ continue
1034
+ state = self.state[p]
1035
+ params_with_grads.append(p)
1036
+ grads.append(g)
1037
+ if "step" not in state:
1038
+ state["step"] = (torch.zeros((),
1039
+ dtype=torch.float32,
1040
+ device=p.device))
1041
+ state["moment1"] = torch.zeros_like(g)
1042
+ state["moment2"] = torch.zeros_like(g)
1043
+ moment1.append(state["moment1"])
1044
+ moment2.append(state["moment2"])
1045
+ if not isinstance(state["step"], torch.Tensor):
1046
+ step_tensor = torch.tensor(state["step"],
1047
+ dtype=torch.float32,
1048
+ device=p.device)
1049
+ else:
1050
+ step_tensor = state["step"]
1051
+ state_steps.append(step_tensor)
1052
+
1053
+ self._fused_adamw(
1054
+ params_with_grads,
1055
+ grads,
1056
+ moment1,
1057
+ moment2,
1058
+ max_exp_avg_sqs,
1059
+ state_steps,
1060
+ amsgrad=False,
1061
+ beta1=beta1,
1062
+ beta2=beta2,
1063
+ lr=lr,
1064
+ weight_decay=weight_decay,
1065
+ eps=eps,
1066
+ maximize=False,
1067
+ )
1068
+
1069
+ return loss
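The file above ends the rebuilt muon.py for this target. As a minimal usage sketch (assuming the Hugging Face `kernels` package's `get_kernel` helper named in the error message inside `Muon.__init__`; `model`, `loss_fn`, and `batch` are hypothetical placeholders, not part of the committed files):

# Hedged sketch, not part of the committed files.
from kernels import get_kernel

optimizer = get_kernel("motif-technologies/optimizer")
params = optimizer.muon.get_default_muon_param_groups(model)  # Muon groups vs. AdamW fallback
optim = optimizer.Muon(params, lr=1e-3, momentum=0.95, ns_steps=5)

loss = loss_fn(model(batch))
loss.backward()
optim.step()      # pass qk_logits=... here to enable QK clipping
optim.zero_grad()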
build/torch29-cxx11-rocm64-x86_64-linux/optimizer/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .muon import Muon
2
+
3
+ __all__ = [
4
+ "Muon",
5
+ ]
build/torch29-cxx11-rocm64-x86_64-linux/optimizer/_ops.py ADDED
@@ -0,0 +1,9 @@
1
+ import torch
2
+ from . import _optimizer_811726c_dirty
3
+ ops = torch.ops._optimizer_811726c_dirty
4
+
5
+ def add_op_namespace_prefix(op_name: str):
6
+ """
7
+ Prefix op by namespace.
8
+ """
9
+ return f"_optimizer_811726c_dirty::{op_name}"
build/torch29-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_811726c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08b55491319446b12d0d890926506639640414edcba945e0f71afef0fac369d5
3
+ size 1749352
build/torch29-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,128 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2025 Tianyang Lin
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import torch
24
+ import triton
25
+ import triton.language as tl
26
+
27
+
28
+ def get_autotune_config():
29
+ return [
30
+ triton.Config(
31
+ {
32
+ 'BLOCK_SIZE_M': blk_m,
33
+ 'BLOCK_SIZE_K': blk_k,
34
+ 'GROUP_SIZE_M': grp_sz
35
+ },
36
+ num_stages=n_stages,
37
+ num_warps=n_warps) for blk_m in [32, 64, 128]
38
+ for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
39
+ for n_warps in [4, 8]
40
+ ]
41
+
42
+
43
+ @triton.autotune(
44
+ configs=get_autotune_config(),
45
+ key=['M', 'K'],
46
+ )
47
+ @triton.jit
48
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
49
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
50
+ GROUP_SIZE_M: tl.constexpr):
51
+ """
52
+ Core kernel jit function of matmul_transpose that computes y = x @ x.T
53
+ The code is a simple adaptation from the triton `matmul` tutorial:
54
+ https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
55
+ """
56
+ pid = tl.program_id(axis=0)
57
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
58
+ num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
59
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
60
+ group_id = pid // num_pid_in_group
61
+ first_pid_m = group_id * GROUP_SIZE_M
62
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
63
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
64
+ pid_n = (pid % num_pid_in_group) // group_size_m
65
+ if pid_m > pid_n:
66
+ return
67
+
68
+ offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
69
+ offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
70
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
71
+ # we use a & b ptrs to denote different rows of x.
72
+ a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
73
+ b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
74
+
75
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
76
+
77
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
78
+ a = tl.load(a_ptrs,
79
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
80
+ other=0.0)
81
+ b = tl.load(b_ptrs,
82
+ mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
83
+ other=0.0)
84
+ accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
85
+ a_ptrs += BLOCK_SIZE_K * stride_xk
86
+ b_ptrs += BLOCK_SIZE_K * stride_xk
87
+ # use dtype.element_ty to accommodate different input datatypes as in cpp templates
88
+ # https://github.com/triton-lang/triton/issues/2252
89
+ c = accumulator.to(x.dtype.element_ty)
90
+
91
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
92
+ offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
93
+ c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
94
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
95
+ tl.store(c_ptrs, c, mask=c_mask)
96
+
97
+ # transpose and copy
98
+ if pid_m < pid_n:
99
+ ct_ptrs = y + stride_ym * offs_cn[:,
100
+ None] + stride_yn * offs_cm[None, :]
101
+ ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
102
+ tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
103
+
104
+
105
+ def matmul_transpose_assign(d_in, d_out):
106
+ assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
107
+ assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
108
+ assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
109
+ assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
110
+ assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
111
+ assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
112
+ assert d_in.size(0) == d_out.size(0) == d_out.size(0), \
113
+ "First dimension of `d_in` must match first and second dimension of `d_out`"
114
+
115
+ d_in = d_in.contiguous()
116
+ M, K = d_in.shape
117
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
118
+ M, META['BLOCK_SIZE_M']), )
119
+ with torch.cuda.device(d_in.device.index):
120
+ mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
121
+ d_out.stride(0), d_out.stride(1))
122
+
123
+
124
+ def matmul_transpose(d_in):
125
+ M, _ = d_in.shape
126
+ d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
127
+ matmul_transpose_assign(d_in, d_out)
128
+ return d_out
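Given that the kernel is documented to compute y = x @ x.T, a quick hedged sanity check (assuming a CUDA device and that this module is importable as shown) might be:

# Hedged sketch: compare the Triton kernel against plain PyTorch.
import torch
from optimizer.matmul_transpose_triton import matmul_transpose

x = torch.randn(256, 512, device="cuda", dtype=torch.bfloat16)
y = matmul_transpose(x)          # (256, 256), should equal x @ x.T up to bf16 rounding
ref = x @ x.T
print(torch.allclose(y, ref, atol=1e-1, rtol=1e-2))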
build/torch29-cxx11-rocm64-x86_64-linux/optimizer/muon.py ADDED
@@ -0,0 +1,1069 @@
1
+ import logging
2
+ import math
3
+ import types
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional, Union, cast
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.distributed._tensor import DTensor, Replicate, Shard
10
+
11
+ from .matmul_transpose_triton import matmul_transpose_assign
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
+
18
+ # This code snippet is a modified version adapted from the following GitHub repositories:
19
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
20
+ # Muon's Newton–Schulz iteration causes high variance in singular values
21
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
22
+ @torch.no_grad()
23
+ # matmul_transpose_assign from : https://github.com/nil0x9/flash-muon
24
+ def _zeropower_via_newtonschulz5(G, steps):
25
+ """
26
+ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
27
+ quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
28
+ of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
29
+ zero even beyond the point where the iteration no longer converges all the way to one everywhere
30
+ on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
31
+ where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
32
+ performance at all relative to UV^T, where USV^T = G is the SVD.
33
+ """
34
+ assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
36
+ X = G # no manual typecast
37
+
38
+ if G.size(0) > G.size(1):
39
+ X = X.T
40
+ # Ensure spectral norm is at most 1
41
+ X = X / (X.norm() + 1e-7)
42
+ buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
43
+ buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
44
+ # Perform the NS iterations
45
+ for a, b, c in [
46
+ (4.0848, -6.8946, 2.9270),
47
+ (3.9505, -6.3029, 2.6377),
48
+ (3.7418, -5.5913, 2.3037),
49
+ (2.8769, -3.1427, 1.2046),
50
+ (2.8366, -3.0525, 1.2012),
51
+ ]:
52
+ matmul_transpose_assign(X, buf1)
53
+ matmul_transpose_assign(buf1, buf2)
54
+ buf1.mul_(b).add_(buf2, alpha=c)
55
+ X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
56
+
57
+ if G.size(0) > G.size(1):
58
+ X = X.T
59
+ return X
60
+
61
+
62
+ @dataclass
63
+ class _muon_state:
64
+ # TODO: use Optional
65
+ worker_rank: int | None = None
66
+ gathered_grad: torch.Tensor | None = None
67
+ scattered_u: DTensor | None = None
68
+ computed_u: torch.Tensor | None = None
69
+ gather_event: torch.cuda.Event | None = None
70
+ compute_event: torch.cuda.Event | None = None
71
+ scatter_event: torch.cuda.Event | None = None
72
+ process_group = None
73
+ qk_clip_state = None
74
+
75
+
76
+ def split_elems_for_src(param, src_rank, num_ranks) -> int:
77
+ rows = param.shape[0]
78
+ cols = int(param.numel() // rows)
79
+ base, rem = divmod(rows, num_ranks)
80
+ my_rows = base + (1 if src_rank < rem else 0)
81
+ return my_rows * cols
82
+
83
+
84
+ @torch.no_grad()
85
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
86
+ """
87
+ Pre-allocate gathered_grad buffer on compute_stream
88
+ before launching all2all gather
89
+ """
90
+ with torch.cuda.stream(compute_stream):
91
+ for p in params:
92
+ state = param_to_state[id(p)]
93
+ if rank == state.worker_rank:
94
+ num_ranks = dist.get_world_size(group=state.process_group)
95
+ state.gathered_grad = torch.empty(p.grad.numel(),
96
+ dtype=COMM_DTYPE,
97
+ device="cuda")
98
+ else:
99
+ state.gathered_grad = None
100
+
101
+ alloc_event = torch.cuda.Event()
102
+ alloc_event.record(compute_stream)
103
+ return alloc_event
104
+
105
+
106
+ @torch.no_grad()
107
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
108
+ alloc_event):
109
+ """
110
+ All2all gathers shards so each owner rank reconstructs its full gradient
111
+ """
112
+ with torch.cuda.stream(comm_stream):
113
+ process_group = param_to_state[id(params[0])].process_group
114
+ num_ranks = dist.get_world_size(group=process_group)
115
+
116
+ # Construct sending buffers
117
+ per_dst = [[] for _ in range(num_ranks)]
118
+ send_counts = [0] * num_ranks
119
+
120
+ for p in params:
121
+ state = param_to_state[id(p)]
122
+ dst = state.worker_rank
123
+ assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
+ g = p.grad
126
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
+ assert g.numel() == shard_elems
128
+ per_dst[dst].append(g)
129
+ send_counts[dst] += shard_elems
130
+
131
+ assert any(
132
+ len(v) > 0 for v in per_dst
133
+ ), "At least one destination rank must receive a sharded tensor"
134
+ # list[list[Tensor]] -> list[Tensor]
135
+ per_dst = [t for dst in per_dst for t in dst]
136
+
137
+ send_buf = torch.cat(per_dst, dim=0)
138
+
139
+ owned_params = [
140
+ p for p in params if param_to_state[id(p)].worker_rank == rank
141
+ ]
142
+
143
+ # Compute receive sizes and allocate receiving buffers
144
+ recv_counts = [0] * num_ranks
145
+
146
+ for src in range(num_ranks):
147
+ total = 0
148
+ for p in owned_params:
149
+ state = param_to_state[id(p)]
150
+ assert state.worker_rank == rank
151
+ total += split_elems_for_src(p, src, num_ranks)
152
+ recv_counts[src] = total
153
+
154
+ recv_total = sum(recv_counts)
155
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
156
+
157
+ #All2All
158
+ dist.all_to_all_single(
159
+ recv_buf,
160
+ send_buf,
161
+ output_split_sizes=recv_counts,
162
+ input_split_sizes=send_counts,
163
+ group=process_group,
164
+ )
165
+
166
+ # Reconstructs gathered grad from the received buffer
167
+ #
168
+ # recv_buf (num ranks = 3)
169
+ #
170
+ # From rank 0 From rank 1 From rank 2
171
+ # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
172
+ #
173
+ # Outer loop:
174
+ # rank 0 -> rank 1 -> rank2
175
+ #
176
+ # Inner loop:
177
+ # p1_n -> p2_n -> p3_n
178
+
179
+ comm_stream.wait_event(alloc_event)
180
+
181
+ off = 0
182
+ write_offsets = {id(p): 0 for p in owned_params}
183
+ for src in range(num_ranks):
184
+ if recv_counts[src] == 0:
185
+ continue
186
+
187
+ block = recv_counts[src]
188
+ inner_off = 0
189
+ for p in owned_params:
190
+ state = param_to_state[id(p)]
191
+ assert state.worker_rank == rank
192
+ n = split_elems_for_src(p, src, num_ranks)
193
+ assert n > 0
194
+
195
+ sg = recv_buf.narrow(0, off + inner_off, n)
196
+ woff = write_offsets[id(p)]
197
+ dst = state.gathered_grad.narrow(0, woff, n)
198
+ dst.copy_(sg)
199
+
200
+ write_offsets[id(p)] += n
201
+ inner_off += n
202
+ off += block
203
+
204
+ for p in params:
205
+ state = param_to_state[id(p)]
206
+ if state.worker_rank == rank:
207
+ state.gathered_grad = state.gathered_grad.view_as(p)
208
+ state.gather_event = torch.cuda.Event()
209
+ state.gather_event.record(comm_stream)
210
+ else:
211
+ state.gathered_grad = None
212
+ state.gather_event = None
213
+ if none_grad:
214
+ p.grad = None
215
+
216
+
217
+ @torch.no_grad()
218
+ def _compute_u(p, state, steps, rank, compute_stream):
219
+ """
220
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
221
+ """
222
+ with torch.cuda.stream(compute_stream):
223
+ if rank == state.worker_rank:
224
+ if state.gather_event is None:
225
+ raise RuntimeError("Gather event must be set before compute.")
226
+ compute_stream.wait_event(state.gather_event)
227
+ u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
228
+ state.gathered_grad = None
229
+ state.computed_u = u
230
+ state.compute_event = torch.cuda.Event()
231
+ state.compute_event.record()
232
+ else:
233
+ state.computed_u = None
234
+ state.compute_event = None
235
+
236
+
237
+ @torch.no_grad()
238
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
239
+ """
240
+ Pre-allocate scattered_u buffer on compute_stream
241
+ before launching all2all scatter
242
+ """
243
+ with torch.cuda.stream(compute_stream):
244
+ for p in params:
245
+ state = param_to_state[id(p)]
246
+ state.scattered_u = torch.empty_like(p.to_local(),
247
+ dtype=COMM_DTYPE)
248
+
249
+ alloc_event = torch.cuda.Event()
250
+ alloc_event.record(compute_stream)
251
+ return alloc_event
252
+
253
+
254
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
255
+ """
256
+ All2all scatters full gradients to all ranks
257
+ """
258
+ with torch.cuda.stream(comm_stream):
259
+ process_group = param_to_state[id(params[0])].process_group
260
+ num_ranks = dist.get_world_size(group=process_group)
261
+ owned_params = [
262
+ p for p in params if param_to_state[id(p)].worker_rank == rank
263
+ ]
264
+
265
+ # Construct sending buffer
266
+ per_dst = [[] for _ in range(num_ranks)]
267
+ send_counts = [0] * num_ranks
268
+
269
+ if owned_params:
270
+ for p in owned_params:
271
+ state = param_to_state[id(p)]
272
+ if state.compute_event is None:
273
+ raise RuntimeError(
274
+ "Compute event must be set before scatter.")
275
+ comm_stream.wait_event(state.compute_event)
276
+ state.gathered_grad = None
277
+
278
+ assert state.computed_u is not None
279
+
280
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
281
+
282
+ offset = 0
283
+ for dst in range(num_ranks):
284
+ n = split_elems_for_src(p, dst, num_ranks)
285
+ assert n > 0
286
+
287
+ su = u_full.narrow(0, offset, n)
288
+ per_dst[dst].append(su)
289
+ send_counts[dst] += n
290
+ offset += n
291
+
292
+ assert offset == u_full.numel()
293
+
294
+ lengths = [len(v) for v in per_dst]
295
+ if all(l > 0 for l in lengths):
296
+ assert all(
297
+ l == lengths[0] for l in lengths
298
+ ), "All destination ranks must have the same number of sharded tensor"
299
+ # list[list[Tensor]] -> list[Tensor]
300
+ per_dst = [t for dst in per_dst for t in dst]
301
+ send_buf = torch.cat(per_dst, dim=0)
302
+ else:
303
+ # all_to_all requires participation from all ranks
304
+ # Even non-owner ranks must join the collective call
305
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
306
+
307
+ # Compute receive sizes and allocate receiving buffers
308
+ recv_counts = [0] * num_ranks
309
+
310
+ for src in range(num_ranks):
311
+ total = 0
312
+ for p in params:
313
+ state = param_to_state[id(p)]
314
+ if state.worker_rank != src:
315
+ continue
316
+ total += split_elems_for_src(p, rank, num_ranks)
317
+ recv_counts[src] = total
318
+
319
+ recv_total = sum(recv_counts)
320
+ assert recv_total > 0
321
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
322
+
323
+ #All2All
324
+ dist.all_to_all_single(
325
+ recv_buf,
326
+ send_buf,
327
+ output_split_sizes=recv_counts,
328
+ input_split_sizes=send_counts,
329
+ group=process_group,
330
+ )
331
+
332
+ # Copy to pre-allocated scattered_u buffer from the received buffer
333
+ #
334
+ # recv_buf (num ranks = 3, local_rank = 0)
335
+ #
336
+ # From rank 0 From rank 1 From rank 2
337
+ # | p1_0, p2_0, p3_0 | p4_0 | p5_0, p6_0 |
338
+ #
339
+ # Outer loop:
340
+ # rank 0 -> rank 1 -> rank2
341
+ #
342
+ # Inner loop:
343
+ # src(0) : p1_0 -> p2_0 -> p3_0
344
+ # src(1) : p4_0
345
+ # src(2) : p5_0 -> p6_0
346
+
347
+ comm_stream.wait_event(alloc_event)
348
+
349
+ off = 0
350
+ for src in range(num_ranks):
351
+ block = recv_counts[src]
352
+ if block == 0:
353
+ continue
354
+
355
+ inner_off = 0
356
+ for p in params:
357
+ state = param_to_state[id(p)]
358
+ if state.worker_rank != src:
359
+ continue
360
+ n = split_elems_for_src(p, rank, num_ranks)
361
+ assert n > 0
362
+
363
+ flat_local = recv_buf.narrow(0, off + inner_off,
364
+ n).view_as(p.to_local())
365
+ state.scattered_u.copy_(flat_local)
366
+
367
+ state.scatter_event = torch.cuda.Event()
368
+ state.scatter_event.record(comm_stream)
369
+ inner_off += n
370
+
371
+ assert inner_off == block
372
+ off += block
373
+
374
+
375
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
376
+ compute_stream):
377
+ """
378
+ Update sharded parameter p with the scattered_u.
379
+ Only worker_rank frees computed_u.
380
+ """
381
+ with torch.cuda.stream(compute_stream):
382
+ if state.scatter_event is None:
383
+ raise RuntimeError("Scatter event must be set before update")
384
+ compute_stream.wait_event(state.scatter_event)
385
+ u_dtensor = DTensor.from_local(
386
+ state.scattered_u,
387
+ placements=p.placements,
388
+ device_mesh=p.device_mesh,
389
+ )
390
+
391
+ state.scattered_u = u_dtensor
392
+
393
+ if rank == state.worker_rank:
394
+ # Free computed_u
395
+ state.computed_u = None
396
+
397
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
398
+ state.scattered_u = None
399
+ u_dtensor = None
400
+
401
+ scales_full = Muon._compute_scales(p, state.qk_clip_state)
402
+ if scales_full is not None:
403
+ num_ranks = dist.get_world_size(group=state.process_group)
404
+ local_rank = dist.get_rank(group=state.process_group)
405
+ scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
406
+ scales_local = DTensor.from_local(
407
+ scales_local,
408
+ placements=p.placements,
409
+ device_mesh=p.device_mesh,
410
+ )
411
+ Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
412
+
413
+
414
+ def default_is_muon(name, x):
415
+ skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
416
+ return x.ndim >= 2 and not any(key in name for key in skip_keys)
417
+
418
+
419
+ def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
420
+ muon_params, muon_names = [], []
421
+ non_muon_params = []
422
+
423
+ for n, p in model.named_parameters():
424
+ if not p.requires_grad:
425
+ continue
426
+ if is_muon_func(n, p):
427
+ muon_params.append(p)
428
+ muon_names.append(n)
429
+ else:
430
+ non_muon_params.append(p)
431
+
432
+ return [
433
+ {
434
+ "params": muon_params,
435
+ "names": muon_names,
436
+ "use_muon": True,
437
+ },
438
+ {
439
+ "params": non_muon_params,
440
+ "use_muon": False,
441
+ },
442
+ ]
443
+
444
+
445
+ def parse_qk_layer(name: str) -> tuple[str | None, int]:
446
+ """
447
+ Parse a parameter name to check if it is a query/key projection layer
448
+ ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
449
+
450
+ Returns:
451
+ (kind, layer_idx) or (None, -1) if not matched.
452
+
453
+ Example:
454
+ 'model.3.attn.wq.weight' -> ('wq', 3)
455
+ 'model.5.attn.wk.weight' -> ('wk', 5)
456
+ 'model.2.attn.q_proj.weight' -> ('q_proj', 2)
457
+ 'model.7.attn.k_proj.weight' -> ('k_proj', 7)
458
+ 'model.4.attn.v_proj.weight' -> (None, -1)
459
+ """
460
+ parts = name.split('.')
461
+ if len(parts) < 3:
462
+ return None, -1
463
+
464
+ kind = parts[-2]
465
+
466
+ layer_idx = -1
467
+ for part in reversed(parts):
468
+ if part.isdigit():
469
+ layer_idx = int(part)
470
+ break
471
+
472
+ if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
473
+ return kind, layer_idx
474
+
475
+ return None, -1
476
+
477
+
478
+ @dataclass
479
+ class QKClipInfo:
480
+ """Per-parameter dynamic info computed from config + runtime logits."""
481
+ kind: Optional[str] # 'wq'/'q_proj' or 'wk'/'k_proj' or None
482
+ indices: List[int] # which heads to consider for clipping
483
+ head_dim: int # from config
484
+ threshold: float # from config
485
+ logit: Optional[torch.Tensor]
486
+
487
+
488
+ class Muon(torch.optim.Optimizer):
489
+ """
490
+ Muon - MomentUm Orthogonalized by Newton-schulz
491
+
492
+ Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
493
+ processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
494
+ matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
495
+ the advantage that it can be stably run in bfloat16 on the GPU.
496
+
497
+ Some warnings:
498
+ - We believe this optimizer is unlikely to work well for training with small batch size.
499
+ - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
500
+
501
+ Arguments:
502
+ params: The parameters (or parameter groups) to be optimized. Each group must set "use_muon";
503
+ see get_default_muon_param_groups(model, is_muon_func) for building these groups from a model.
504
+ lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
505
+ momentum: The momentum used by the internal SGD. (0.95 is a good default)
506
+ nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
507
+ ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
508
+ weight_decay: The weight decay for Muon and AdamW.
509
+ Parameters that are {0, 1}-D or are detected as being the embed or lm_head are optimized by AdamW as well.
510
+ adamw_lr: The learning rate for the internal AdamW.
511
+ adamw_betas: The betas for the internal AdamW.
512
+ adamw_eps: The epsilon for the internal AdamW.
513
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
514
+ debug: Whether to print debug information.
515
+ clip_config : Configuration for QK clipping. Expected keys:
516
+ - "q_indices" (list[int]): Indices of query heads to consider.
517
+ - "k_indices" (list[int]): Indices of key heads to consider.
518
+ - "head_dim" (int): Dimensionality of each attention head.
519
+ - "threshold" (float): Threshold value; heads whose QK logits exceed
520
+ this value will be scaled down.
521
+ Default is:
522
+ {
523
+ "q_indices": [],
524
+ "k_indices": [],
525
+ "head_dim": 128,
526
+ "threshold": 100
527
+ }
528
+ overlap_step : How many all2all gather and compute operations are launched in advance
529
+ before the corresponding all2all scatter steps begin.
530
+ A higher overlap_step increases memory usage but can improve
531
+ performance by overlapping communication.
532
+ Parallel muon only.
533
+ """
534
+
535
+ def __init__(self,
536
+ params,
537
+ lr=1e-3,
538
+ momentum=0.95,
539
+ nesterov=True,
540
+ ns_steps=5,
541
+ weight_decay=0.1,
542
+ adamw_betas=(0.9, 0.95),
543
+ adamw_eps=1e-8,
544
+ none_grad=True,
545
+ debug=False,
546
+ clip_config={
547
+ "q_indices": [],
548
+ "k_indices": [],
549
+ "head_dim": 128,
550
+ "threshold": 100
551
+ },
552
+ overlap_step=5):
553
+ defaults = dict(
554
+ lr=lr,
555
+ weight_decay=weight_decay,
556
+ momentum=momentum,
557
+ nesterov=nesterov,
558
+ ns_steps=ns_steps,
559
+ adamw_betas=adamw_betas,
560
+ adamw_eps=adamw_eps,
561
+ none_grad=none_grad,
562
+ use_muon=True,
563
+ )
564
+ error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
565
+ instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
566
+
567
+ if isinstance(params, types.GeneratorType):
568
+ raise ValueError(error_message.format(idx=0) + instruction_code)
569
+ for _idx, param_group in enumerate(params):
570
+ if param_group.get("use_muon", None) is None:
571
+ raise ValueError(
572
+ error_message.format(idx=_idx) + instruction_code)
573
+
574
+ super().__init__(params, defaults)
575
+
576
+ self.rank = None
577
+
578
+ self.comm_stream = torch.cuda.Stream()
579
+ self.compute_stream = torch.cuda.Stream()
580
+ self.debug = debug
581
+ self.clip_config = clip_config
582
+ self.overlap_step = overlap_step
583
+
584
+ def _calc_flops(self, G, steps):
585
+ assert len(G.shape) == 2
586
+ M, N = G.shape
587
+ if M > N:
588
+ M, N = N, M
589
+
590
+ return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
591
+
592
+ def adjust_lr_for_muon(self, lr, param_shape):
593
+ A, B = param_shape[:2]
594
+ # We adjust the learning rate and weight decay based on the size of the parameter matrix
595
+ # as described in the paper
596
+ adjusted_ratio = 0.2 * math.sqrt(max(A, B))
597
+ adjusted_lr = lr * adjusted_ratio
598
+ return adjusted_lr
599
+
600
+ def get_shard_mesh(self, p):
601
+ """
602
+ Get the shard mesh for a parameter p on the given rank.
603
+ """
604
+ assert isinstance(
605
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
606
+
607
+ if p.placements == (Shard(dim=0), ):
608
+ # Case for FSDP
609
+ process_group = p.device_mesh.get_group(mesh_dim=0)
610
+ if self.rank is None:
611
+ self.rank = dist.get_rank(group=process_group)
612
+ else:
613
+ assert self.rank == dist.get_rank(group=process_group)
614
+ return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
615
+ elif p.placements == (Replicate(), Shard(dim=0)):
616
+ # Case for HSDP
617
+ process_group = p.device_mesh.get_group(mesh_dim=1)
618
+ if self.rank is None:
619
+ self.rank = dist.get_rank(group=process_group)
620
+ else:
621
+ assert self.rank == dist.get_rank(group=process_group)
622
+ for i, shard_mesh in enumerate(p.device_mesh.mesh):
623
+ if self.rank in shard_mesh:
624
+ return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
625
+ else:
626
+ raise ValueError(f"Unsupported placements ({p.placements}).")
627
+
628
+ def init_state_and_assign_params(self, names, params, group, qk_logits):
629
+ param_to_state = {}
630
+ param_to_flops = {}
631
+
632
+ total_flops = 0
633
+ for p in params:
634
+ g = p.grad
635
+ if g is None:
636
+ continue
637
+ assert g.ndim == 2, "Muon only supports 2D parameters."
638
+
639
+ flops = self._calc_flops(g, group["ns_steps"])
640
+ param_to_flops[id(p)] = flops
641
+ total_flops += flops
642
+
643
+ if self.debug:
644
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
645
+ flush=True)
646
+
647
+ paired = list(zip(names, params))
648
+
649
+ paired_sorted = sorted(paired,
650
+ key=lambda x: param_to_flops[id(x[1])],
651
+ reverse=True)
652
+
653
+ names_sorted, params_sorted = zip(*paired_sorted)
654
+ ordered_names = list(names_sorted)
655
+ ordered_params = list(params_sorted)
656
+
657
+ round_robin = 0
658
+ mesh = None
659
+ shard_mesh = None
660
+ process_group = None
661
+ for n, p in zip(ordered_names, ordered_params):
662
+ if mesh is None:
663
+ mesh = p.device_mesh
664
+ shard_mesh, process_group = self.get_shard_mesh(p)
665
+ elif mesh != p.device_mesh:
666
+ raise ValueError("All parameters must be on the same mesh.")
667
+ num_ranks = dist.get_world_size(group=process_group)
668
+ param_to_state[id(p)] = _muon_state()
669
+ param_to_state[id(
670
+ p)].worker_rank = shard_mesh[round_robin].item() % num_ranks
671
+ param_to_state[id(p)].process_group = process_group
672
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
673
+ param_to_state[id(p)].qk_clip_state = qk_clip_state
674
+ round_robin = (round_robin + 1) % len(shard_mesh)
675
+
676
+ return param_to_state, ordered_params
677
+
678
+ def base(self, names, params, group, lr, weight_decay, momentum,
679
+ qk_logits):
680
+ # generate weight updates in distributed fashion
681
+ for n, p in zip(names, params):
682
+ g = p.grad
683
+ if g is None:
684
+ continue
685
+ if g.ndim > 2:
686
+ g = g.view(g.size(0), -1)
687
+ assert g is not None
688
+
689
+ # calc update
690
+ state = self.state[p]
691
+ if "momentum_buffer" not in state:
692
+ state["momentum_buffer"] = torch.zeros_like(g)
693
+ buf = state["momentum_buffer"]
694
+ buf.mul_(momentum).add_(g)
695
+ if group["nesterov"]:
696
+ g = g.add(buf, alpha=momentum)
697
+ else:
698
+ g = buf
699
+
700
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
701
+ steps=group["ns_steps"])
702
+
703
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
704
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
705
+
706
+ qk_clip_state = self.get_qk_clip_info(n, qk_logits)
707
+
708
+ scales_full = self._compute_scales(p, qk_clip_state)
709
+ if scales_full is not None:
710
+ Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
711
+
712
+ def _update_g(self, p, g, group, momentum):
713
+ # calc update
714
+ state = self.state[p]
715
+ buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
716
+ torch.add(g, buf, alpha=momentum, out=buf)
717
+ if group["nesterov"]:
718
+ g.add_(buf, alpha=momentum)
719
+ return g
720
+ return buf
721
+
722
+ @staticmethod
723
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
724
+ # apply weight decay
725
+ p.data.mul_(1 - lr * weight_decay)
726
+ # apply update
727
+ p.data.add_(u, alpha=-adjusted_lr)
728
+
729
+ def get_qk_clip_info(self, n, qk_logits):
730
+ head_dim = self.clip_config.get('head_dim')
731
+ threshold = self.clip_config.get('threshold')
732
+ kind, layer_idx = parse_qk_layer(n)
733
+
734
+ logit, indices = None, []
735
+ if qk_logits is not None and kind is not None:
736
+ logit = qk_logits[layer_idx]
737
+ indices_key = 'q_indices' if 'q' in kind else 'k_indices'
738
+ indices = self.clip_config.get(indices_key, []) or []
739
+
740
+ return QKClipInfo(
741
+ kind=kind,
742
+ indices=indices,
743
+ head_dim=head_dim,
744
+ threshold=threshold,
745
+ logit=logit,
746
+ )
747
+
748
+ @staticmethod
749
+ def _compute_scales(p, qk_clip_state):
750
+ kind = qk_clip_state.kind
751
+ indices = qk_clip_state.indices
752
+ head_dim = qk_clip_state.head_dim
753
+ threshold = qk_clip_state.threshold
754
+ logit = qk_clip_state.logit
755
+
756
+ H_global = p.shape[0] // head_dim
757
+ scales_full = torch.ones(H_global, device=p.data.device)
758
+ scaling = 0
759
+
760
+ for logit_idx, head_idx in enumerate(indices):
761
+ v_ele = float(logit[logit_idx])
762
+ if v_ele > threshold:
763
+ new_scale = math.sqrt(threshold / v_ele)
764
+ if new_scale < scales_full[head_idx]:
765
+ scales_full[head_idx] = new_scale
766
+ logger.info(
767
+ f"[{kind}] Head {head_idx} exceeded threshold "
768
+ f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
769
+ )
770
+ scaling += 1
771
+
772
+ return scales_full if scaling > 0 else None
773
+
774
+ @staticmethod
775
+ def _qk_clip(p, scales, head_dim):
776
+ W = p.data.view(-1, head_dim, p.data.shape[1])
777
+ W.mul_(scales.view(-1, 1, 1))
778
+
779
+ def parallel(self, names, params, group, lr, weight_decay, momentum,
780
+ qk_logits):
781
+ """
782
+ Perform a parallel optimization step using Muon.
783
+ """
784
+
785
+ for p in params:
786
+ g = p.grad
787
+ if g is None:
788
+ continue
789
+ if g.ndim > 2:
790
+ g = g.view(g.size(0), -1)
791
+
792
+ # Update g in the local rank
793
+ g = self._update_g(
794
+ p,
795
+ g,
796
+ group,
797
+ momentum=momentum,
798
+ )
799
+ p.grad = g
800
+
801
+ param_to_state, ordered_params = self.init_state_and_assign_params(
802
+ names, params, group, qk_logits)
803
+
804
+ assert self.rank is not None
805
+
806
+ def enqueue_all2all_gather(start_idx, chunk_size):
807
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
808
+ if target_params:
809
+ alloc_event = _alloc_gathered_grad(target_params,
810
+ param_to_state, self.rank,
811
+ self.compute_stream)
812
+ _all2all_gather(target_params, param_to_state, self.rank,
813
+ self.comm_stream, group["none_grad"],
814
+ alloc_event)
815
+
816
+ def enqueue_computes(start_idx, chunk_size):
817
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
818
+ state = param_to_state[id(p)]
819
+ _compute_u(p, state, group["ns_steps"], self.rank,
820
+ self.compute_stream)
821
+
822
+ def enqueue_all2all_scatter(start_idx, chunk_size):
823
+ target_params = ordered_params[start_idx:start_idx + chunk_size]
824
+ if target_params:
825
+ alloc_event = _alloc_scattered_u(target_params, param_to_state,
826
+ self.rank,
827
+ self.compute_stream)
828
+ _all2all_scatter(target_params, param_to_state, self.rank,
829
+ self.comm_stream, alloc_event)
830
+
831
+ def enqueue_update_param(start_idx, chunk_size):
832
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
833
+ state = param_to_state[id(p)]
834
+ adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
835
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
836
+ self.rank, self.compute_stream)
837
+
838
+ chunk_size = dist.get_world_size(param_to_state[id(
839
+ params[0])].process_group)
840
+
841
+ # Wait grad update
842
+ self.comm_stream.wait_stream(torch.cuda.current_stream())
843
+
844
+ overlap_step = self.overlap_step
845
+ for i in range(0, overlap_step):
846
+ enqueue_all2all_gather(i * chunk_size, chunk_size)
847
+ enqueue_computes(i * chunk_size, chunk_size)
848
+
849
+ for i in range(0, len(params) + chunk_size - 1, chunk_size):
850
+ enqueue_all2all_scatter(i, chunk_size)
851
+ enqueue_all2all_gather(i + overlap_step * chunk_size, chunk_size)
852
+ enqueue_update_param(i, chunk_size)
853
+ enqueue_computes(i + overlap_step * chunk_size, chunk_size)
854
+
855
+ # Wait the last update_param to finish
856
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
857
+
858
+ @staticmethod
859
+ def _fused_adamw(
860
+ params: list[torch.Tensor],
861
+ grads: list[torch.Tensor],
862
+ exp_avgs: list[torch.Tensor],
863
+ exp_avg_sqs: list[torch.Tensor],
864
+ max_exp_avg_sqs: list[torch.Tensor],
865
+ state_steps: list[torch.Tensor],
866
+ amsgrad: bool,
867
+ beta1: float,
868
+ beta2: float,
869
+ lr: Union[float, torch.Tensor],
870
+ weight_decay: float,
871
+ eps: float,
872
+ maximize: bool,
873
+ ) -> None:
874
+ if not params:
875
+ return
876
+
877
+ # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
878
+ # treating it as a scalar.
879
+ lr_dict: Optional[DeviceDict] = ({
880
+ lr.device: lr
881
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
882
+ None)
883
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
884
+ [
885
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
886
+ state_steps
887
+ ] # type: ignore[list-item]
888
+ )
889
+ for (device, _), (
890
+ (
891
+ device_params_,
892
+ device_grads_,
893
+ device_exp_avgs_,
894
+ device_exp_avg_sqs_,
895
+ device_max_exp_avg_sqs,
896
+ device_state_steps_,
897
+ ),
898
+ _,
899
+ ) in grouped_tensors.items():
900
+ device_params = cast(list[torch.Tensor], device_params_)
901
+ device_grads = cast(list[torch.Tensor], device_grads_)
902
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
903
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
904
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
905
+
906
+ if lr_dict is not None and device not in lr_dict:
907
+ lr_dict[device] = lr.to(
908
+ device=device,
909
+ non_blocking=True) # type: ignore[union-attr]
910
+ lr = lr_dict[device]
911
+ torch._foreach_add_(device_state_steps, 1)
912
+ func = torch._fused_adamw_
913
+ func(
914
+ device_params,
915
+ device_grads,
916
+ device_exp_avgs,
917
+ device_exp_avg_sqs,
918
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
919
+ device_state_steps,
920
+ amsgrad=amsgrad,
921
+ lr=lr, # type: ignore[arg-type]
922
+ beta1=beta1,
923
+ beta2=beta2,
924
+ weight_decay=weight_decay,
925
+ eps=eps,
926
+ maximize=maximize,
927
+ )
928
+
929
+ def step(self, closure=None, qk_logits=None):
930
+ """Perform a single optimization step.
931
+
932
+ Args:
933
+ closure (Callable, optional): A closure that reevaluates the model
934
+ and returns the loss.
935
+ qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
936
+ to 1D tensors of shape (num_heads,), representing the maximum
937
+ QK logits across all tokens, computed as
938
+ (1 / sqrt(head_dim)) * (Q @ K^T).
939
+ """
940
+ loss = None
941
+ if closure is not None:
942
+ with torch.enable_grad():
943
+ loss = closure()
944
+
945
+ for group in self.param_groups:
946
+ params = group["params"]
947
+
948
+ if group["use_muon"]:
949
+ ############################
950
+ # Muon #
951
+ ############################
952
+ lr = group["lr"]
953
+ weight_decay = group["weight_decay"]
954
+ momentum = group["momentum"]
955
+ names = group["names"]
956
+
957
+ param_dtensors = []
958
+ param_tensors = []
959
+ name_dtensors = []
960
+ name_tensors = []
961
+
962
+ for n, p in zip(names, params):
963
+ if p is None or p.grad is None:
964
+ continue
965
+ if isinstance(p.data, DTensor):
966
+ if all(
967
+ isinstance(placement, Replicate)
968
+ for placement in p.placements):
969
+ param_tensors.append(p)
970
+ name_tensors.append(n)
971
+ else:
972
+ param_dtensors.append(p)
973
+ name_dtensors.append(n)
974
+ elif isinstance(p.data, torch.Tensor):
975
+ param_tensors.append(p)
976
+ name_tensors.append(n)
977
+ else:
978
+ raise TypeError(
979
+ f"Unsupported parameter type: {type(p.data)}")
980
+
981
+ if self.debug:
982
+ print(
983
+ f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
984
+ flush=True,
985
+ )
986
+
987
+ if len(param_dtensors) > 0:
988
+ if not dist.is_initialized():
989
+ raise RuntimeError(
990
+ "Parallel Muon requires torch.distributed to be initialized."
991
+ )
992
+
993
+ self.parallel(
994
+ name_dtensors,
995
+ param_dtensors,
996
+ group,
997
+ lr=lr,
998
+ weight_decay=weight_decay,
999
+ momentum=momentum,
1000
+ qk_logits=qk_logits,
1001
+ )
1002
+
1003
+ if len(param_tensors) > 0:
1004
+ self.base(
1005
+ name_tensors,
1006
+ param_tensors,
1007
+ group,
1008
+ lr=lr,
1009
+ weight_decay=weight_decay,
1010
+ momentum=momentum,
1011
+ qk_logits=qk_logits,
1012
+ )
1013
+
1014
+ else:
1015
+ ############################
1016
+ # AdamW backup #
1017
+ ############################
1018
+
1019
+ params_with_grads = []
1020
+ grads = []
1021
+ moment1 = []
1022
+ moment2 = []
1023
+ max_exp_avg_sqs = []
1024
+ state_steps = []
1025
+ lr = group["lr"]
1026
+ beta1, beta2 = group["adamw_betas"]
1027
+ eps = group["adamw_eps"]
1028
+ weight_decay = group["weight_decay"]
1029
+
1030
+ for p in params:
1031
+ g = p.grad
1032
+ if g is None:
1033
+ continue
1034
+ state = self.state[p]
1035
+ params_with_grads.append(p)
1036
+ grads.append(g)
1037
+ if "step" not in state:
1038
+ state["step"] = (torch.zeros((),
1039
+ dtype=torch.float32,
1040
+ device=p.device))
1041
+ state["moment1"] = torch.zeros_like(g)
1042
+ state["moment2"] = torch.zeros_like(g)
1043
+ moment1.append(state["moment1"])
1044
+ moment2.append(state["moment2"])
1045
+ if not isinstance(state["step"], torch.Tensor):
1046
+ step_tensor = torch.tensor(state["step"],
1047
+ dtype=torch.float32,
1048
+ device=p.device)
1049
+ else:
1050
+ step_tensor = state["step"]
1051
+ state_steps.append(step_tensor)
1052
+
1053
+ self._fused_adamw(
1054
+ params_with_grads,
1055
+ grads,
1056
+ moment1,
1057
+ moment2,
1058
+ max_exp_avg_sqs,
1059
+ state_steps,
1060
+ amsgrad=False,
1061
+ beta1=beta1,
1062
+ beta2=beta2,
1063
+ lr=lr,
1064
+ weight_decay=weight_decay,
1065
+ eps=eps,
1066
+ maximize=False,
1067
+ )
1068
+
1069
+ return loss
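The step() docstring above describes qk_logits as a per-layer map of maximum QK logits. A minimal hedged sketch of driving QK clipping through that interface, reusing `params` from get_default_muon_param_groups as in the earlier sketch (the logit values below are fabricated for illustration):

# Hedged sketch: head 1 of layer 0 exceeds the threshold of 100, so the matching
# q_proj/k_proj head rows would be rescaled by sqrt(100 / 120) after the Muon update.
import torch

optim = Muon(params,
             clip_config={"q_indices": [0, 1], "k_indices": [0, 1],
                          "head_dim": 128, "threshold": 100})
qk_logits = {0: torch.tensor([50.0, 120.0])}   # layer 0: per-head max QK logits
optim.step(qk_logits=qk_logits)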