YangKai0616 committed
Commit a32f88a · 1 Parent(s): c1e53ae

Update test_matmul.py to support UT of XPU

tests/conftest.py CHANGED
@@ -1,5 +1,5 @@
 import pytest
-
+import triton
 
 def pytest_addoption(parser):
     parser.addoption("--device", action="store", default="cuda")
@@ -12,8 +12,19 @@ def device(request):
 
 @pytest.fixture
 def fresh_knobs(monkeypatch):
+    try:
+        _ver_str = getattr(triton, "__version__", "0.0.0").split("+")[0]
+        _parts = _ver_str.split(".")
+        _ver_tuple = tuple(int(p) for p in (_parts + ["0", "0", "0"])[:3])
+    except Exception:
+        _ver_tuple = (0, 0, 0)
+
     from triton._internal_testing import _fresh_knobs_impl
-    fresh_function, reset_function = _fresh_knobs_impl(monkeypatch)
+    if _ver_tuple > (3, 4, 0):
+        fresh_function, reset_function = _fresh_knobs_impl()
+    else:
+        fresh_function, reset_function = _fresh_knobs_impl(monkeypatch)
+
     try:
         yield fresh_function()
     finally:
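The conftest.py change works around a signature change in Triton's test helper: the patch assumes that Triton newer than 3.4.0 calls `_fresh_knobs_impl()` without the `monkeypatch` argument, so the fixture parses `triton.__version__` into a comparable tuple and picks the matching call. A standalone sketch of just the parsing step, where `parse_triton_version` is a hypothetical name and not part of the patch:

# Minimal sketch of the version parsing used by the fixture above.
# `parse_triton_version` is a hypothetical helper, not part of the patch.
def parse_triton_version(ver_str: str) -> tuple:
    base = ver_str.split("+")[0]   # drop local suffixes, e.g. "3.4.0+gitabc" -> "3.4.0"
    parts = base.split(".")
    return tuple(int(p) for p in (parts + ["0", "0", "0"])[:3])

assert parse_triton_version("3.4.0+gitabc") == (3, 4, 0)
assert parse_triton_version("3.5") == (3, 5, 0)
# Anything newer than 3.4.0 takes the no-argument _fresh_knobs_impl() path.
assert parse_triton_version("3.5") > (3, 4, 0)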
tests/test_matmul.py CHANGED
@@ -20,7 +20,7 @@ from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_m
 # testing utilities
 from triton_kernels.testing import assert_close, compute_actual_scale
 # target-specific utilities
-from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4
+from triton_kernels.target_info import is_hip, is_xpu, is_hip_cdna3, is_cuda, is_hip_cdna4
 
 # ---------------
 # initialize data
@@ -70,7 +70,7 @@ def init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_sh
     if mode == 'batched' or (not has_y_gammas) or (has_y_gammas and (gindx is not None) and act_dtype.itemsize >= 2):
         gs0 = None
         gs1 = None
-    if "float8" in str(weight_dtype) and torch.cuda.get_device_capability()[0] < 10:
+    if is_cuda() and "float8" in str(weight_dtype) and torch.cuda.get_device_capability()[0] < 10:
         w = w.transpose(-1, -2).contiguous().transpose(-1, -2)
     return x, w, bias, gs0, gs1
 
@@ -291,14 +291,15 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
     if hbm_swizzling:
         if is_hip():
             pytest.skip("NYI. HBM swizzling just implemented for CUDA.")
-        if torch.cuda.get_device_capability()[0] < 9:
-            pytest.skip("NYI. Ampere swizzling.")
-        if torch.cuda.get_device_capability()[0] < 10:
-            if "mxfloat4" not in weight_dtype_str:
-                pytest.skip("NYI. Hopper swizzling just implemented for mxfp4.")
-            if k % 64 != 0 or n % 64 != 0:
-                # Automatic padding not implemented for Hopper swizzle
-                pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")
+        if is_cuda():
+            if torch.cuda.get_device_capability()[0] < 9:
+                pytest.skip("NYI. Ampere swizzling.")
+            if torch.cuda.get_device_capability()[0] < 10:
+                if "mxfloat4" not in weight_dtype_str:
+                    pytest.skip("NYI. Hopper swizzling just implemented for mxfp4.")
+                if k % 64 != 0 or n % 64 != 0:
+                    # Automatic padding not implemented for Hopper swizzle
+                    pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")
 
     # launch metadata for batched / mx types may not work yet.
     test_launch_metadata = (mode == "ragged") and ("mx" not in weight_dtype_str)
@@ -306,7 +307,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
     torch.manual_seed(0)
 
     block_k = None
-    if is_persistent and weight_dtype_str.startswith("mx") and torch.cuda.get_device_capability()[0] < 10:
+    if is_cuda() and is_persistent and weight_dtype_str.startswith("mx") and torch.cuda.get_device_capability()[0] < 10:
        # Override block_k for testing correctness. The default is temporarily 128 for
        # performance reasons which doesn't work with persistent matmul.
        # TODO: revisit when Triton is better for H100 + MXFP4
@@ -462,7 +463,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
 
     round_y = lambda y: (y / y_scale).to(act_dtype).to(torch.float32) * y_scale if sep_scatter else y
     ref_y = matmul_ogs_torch(x_ref, w_ref, bias_ref, #
-                             rdata, gindx, sindx, round_x=round_x, round_y=round_y, gammas=gs1_ref)
+                             rdata, gindx, sindx, round_x=round_x, round_y=round_y, gammas=gs1_ref, device=device)
     scale = lambda val, scal: val if scal is None else val / scal
     if n_expt_shards > 1:
         if do_scatter:
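The common thread in these hunks is that `torch.cuda.get_device_capability()` is only queried after an `is_cuda()` check, since on an XPU-only machine there is no CUDA device to ask, and the reference path now forwards `device=device` to `matmul_ogs_torch` so the torch reference runs on the same device as the kernel under test. A minimal sketch of the guard pattern; the `is_cuda()` stand-in below is an assumption, not the repository's `triton_kernels.target_info` implementation, and `pre_blackwell_cuda` is a hypothetical name:

import torch

def is_cuda() -> bool:
    # Assumption: stand-in for triton_kernels.target_info.is_cuda().
    return torch.cuda.is_available()

def pre_blackwell_cuda() -> bool:
    # The capability query is CUDA-only, so is_cuda() must short-circuit first;
    # on XPU or HIP this branch is simply skipped.
    return is_cuda() and torch.cuda.get_device_capability()[0] < 10

With this shape, the same test body can run when pytest is pointed at another backend via the `--device` option that conftest.py already defines, and CUDA-specific skips or block-size overrides never fire on XPU.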
torch-ext/triton_kernels/swiglu.py CHANGED
@@ -35,7 +35,7 @@ class SwiGLU(torch.autograd.Function):
         # optimization hyperparameters
         BLOCK_M, BLOCK_N = 32 // a.itemsize, 128
         num_warps = 4
-        kwargs = {'maxnreg': 64} if not target_info.is_hip() else {}
+        kwargs = {'maxnreg': 64} if not target_info.is_hip() and not target_info.is_xpu() else {}
         # launch semi-persistent kernel
         N_BLOCKS = triton.cdiv(N // 2, BLOCK_N)
         num_sms = target_info.num_sms()
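Finally, the swiglu.py change drops the `maxnreg` launch hint on XPU as well as on HIP; `maxnreg` roughly corresponds to CUDA's per-thread register cap, and the diff treats it as unsupported on the other backends. A small sketch of that gating as a pure function, with the backend flags passed in explicitly instead of read from `target_info` (names are illustrative):

def launch_kwargs(on_hip: bool, on_xpu: bool) -> dict:
    # Pass the register cap only on backends that accept it (CUDA).
    return {'maxnreg': 64} if not on_hip and not on_xpu else {}

assert launch_kwargs(on_hip=False, on_xpu=False) == {'maxnreg': 64}  # CUDA keeps the cap
assert launch_kwargs(on_hip=True, on_xpu=False) == {}                # HIP: no maxnreg
assert launch_kwargs(on_hip=False, on_xpu=True) == {}                # XPU: no maxnreg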