Kernels
danieldk HF Staff commited on
Commit
ff86389
·
verified ·
1 Parent(s): 38c7386

Build uploaded using `kernels` (batch 8/10).

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_reorder.h +111 -0
  3. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor.h +541 -0
  4. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor_planar_complex.h +591 -0
  5. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_uncompress.h +157 -0
  6. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/index_sequence.h +38 -0
  7. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/mixed_dtype_utils.hpp +472 -0
  8. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/packed_stride.hpp +570 -0
  9. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/print_error.hpp +341 -0
  10. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/inner_product.h +135 -0
  11. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h +94 -0
  12. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/convolution.h +1549 -0
  13. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm.h +385 -0
  14. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_complex.h +350 -0
  15. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h +311 -0
  16. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gett.hpp +146 -0
  17. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +162 -0
  18. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h +168 -0
  19. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +159 -0
  20. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h +355 -0
  21. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_compare.h +250 -0
  22. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_fill.h +2075 -0
  23. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +142 -0
  24. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_reduce.h +514 -0
  25. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_relu.h +141 -0
  26. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/thread/gemm.h +186 -0
  27. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/conv.hpp +782 -0
  28. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/convolution.h +802 -0
  29. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/error_metrics.h +66 -0
  30. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm.h +531 -0
  31. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_complex.h +210 -0
  32. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h +228 -0
  33. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gett.hpp +916 -0
  34. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/rank_2k.h +261 -0
  35. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h +318 -0
  36. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/rank_k_complex.h +234 -0
  37. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/symm.h +285 -0
  38. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/symm_complex.h +319 -0
  39. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_compare.h +616 -0
  40. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_compare.hpp +101 -0
  41. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_copy.h +256 -0
  42. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h +341 -0
  43. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_fill.h +1718 -0
  44. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_fill.hpp +432 -0
  45. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_foreach.h +134 -0
  46. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_norm.h +42 -0
  47. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_reduce.h +203 -0
  48. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp +203 -0
  49. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/trmm.h +215 -0
  50. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/trmm_complex.h +262 -0
.gitattributes CHANGED
@@ -16,3 +16,4 @@ build/torch210-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=
16
  build/torch210-cxx11-cu130-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
17
  build/torch29-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
18
  build/torch29-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 
 
16
  build/torch210-cxx11-cu130-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
17
  build/torch29-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
18
  build/torch29-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
19
+ build/torch29-cxx11-cu129-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_reorder.h ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief reorder data from the host side
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/coord.h"
39
+ #include "cutlass/util/host_tensor.h"
40
+ #include "cutlass/tensor_view.h"
41
+ #include "cutlass/util/tensor_view_io.h"
42
+ #include "cutlass/util/reference/host/gemm.h"
43
+
44
+ namespace cutlass {
45
+
46
+ /// This is needed for the interleaved integer tensor core kernels. The purpose
47
+ /// is to use skip the shared memory part in the epilogue.
48
+ template <int Interleaved, typename Element, typename Layout>
49
+ void reorder_column(TensorRef<Element, Layout> dest,
50
+ TensorRef<Element, Layout> src,
51
+ cutlass::gemm::GemmCoord problem_size) {
52
+ const int InstructionShapeCol = 8;
53
+ // 4 threads per Quad
54
+ const int ElementsPerThread = InstructionShapeCol / 4;
55
+ // 4 threads per Quad
56
+ const int ReorderedElementsPerThread =
57
+ Interleaved / 4;
58
+
59
+ for (int n = 0; n < problem_size.n(); n++) {
60
+ for (int k = 0; k < problem_size.k(); k++) {
61
+ dest.at({k, (n / Interleaved) * Interleaved +
62
+ ((n % ReorderedElementsPerThread) / ElementsPerThread) *
63
+ InstructionShapeCol +
64
+ ((n % Interleaved) / ReorderedElementsPerThread) *
65
+ ElementsPerThread +
66
+ (n % ElementsPerThread)}) = src.at({k, n});
67
+ }
68
+ }
69
+ }
70
+
71
+ template <int ColumnInterleaved, int LayoutInterleaved = ColumnInterleaved, typename Element, typename Layout>
72
+ void reorder_convK(TensorRef<Element, Layout> dest,
73
+ TensorRef<Element, Layout> src,
74
+ cutlass::gemm::GemmCoord problem_size) {
75
+
76
+ TensorRef<Element, layout::RowMajorInterleaved<LayoutInterleaved>> mappedDest(dest.data(), dest.stride(0));
77
+ TensorRef<Element, layout::RowMajorInterleaved<LayoutInterleaved>> mappedSrc(src.data(), src.stride(0));
78
+
79
+ reorder_column<ColumnInterleaved>(
80
+ mappedDest, mappedSrc, problem_size);
81
+ }
82
+
83
+ /// This is needed for the sparse tensor core kernels. The purpose
84
+ /// is to use ldmatrix to load from shared memory to the register file.
85
+ template <typename Element, typename LayoutDest, typename LayoutSrc>
86
+ void reorder_meta(TensorRef<Element, LayoutDest> dest,
87
+ TensorRef<Element, LayoutSrc> src,
88
+ cutlass::gemm::GemmCoord problem_size) {
89
+ for (int m = 0; m < problem_size.m(); m++) {
90
+ for (int k = 0; k < problem_size.k(); k++) {
91
+ // First reorder the rows.
92
+ int group = (sizeof(Element) == 2) ? 32 : 16;
93
+ int interweave = (sizeof(Element) == 2) ? 4 : 2;
94
+
95
+ int dest_row = m / group * group + (m % 8) * interweave + (m % group) / 8;
96
+ int dest_col = k;
97
+
98
+ // Next swizzle the 2x2 blocks from Z to N.
99
+ if (((dest_row % 2) == 0) && ((dest_col % 2) == 1)) {
100
+ ++dest_row;
101
+ --dest_col;
102
+ } else if (((dest_row % 2) == 1) && ((dest_col % 2) == 0)) {
103
+ --dest_row;
104
+ ++dest_col;
105
+ }
106
+
107
+ dest.at({dest_row, dest_col}) = src.at({m, k});
108
+ }
109
+ }
110
+ }
111
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor.h ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ /*! \file
34
+ \brief HostTensor contributes management for both host and device memory.
35
+
36
+ HostTensor allocates host and device memory upon construction. Basic element-wise operations on
37
+ host memory synchronize device memory automatically. Explicit copy operations provide abstractions
38
+ for CUDA memcpy operations.
39
+
40
+ Call {host, device}_{data, ref, view}() for accessing host or device memory.
41
+
42
+ See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details.
43
+ */
44
+
45
+ #include <vector>
46
+
47
+ #include "cutlass/cutlass.h"
48
+ #include "cutlass/tensor_ref.h"
49
+ #include "cutlass/tensor_view.h"
50
+ #include "cutlass/fast_math.h"
51
+
52
+ #include "device_memory.h"
53
+
54
+ namespace cutlass {
55
+
56
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
+ /// Host tensor
59
+ template <
60
+ /// Data type of element stored within tensor (concept: NumericType)
61
+ typename Element_,
62
+ /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
63
+ typename Layout_
64
+ >
65
+ class HostTensor {
66
+ public:
67
+
68
+ /// Data type of individual access
69
+ using Element = Element_;
70
+
71
+ /// Mapping function from logical coordinate to linear memory
72
+ using Layout = Layout_;
73
+
74
+ /// Logical rank of tensor index space
75
+ static int const kRank = Layout::kRank;
76
+
77
+ /// Index type
78
+ using Index = typename Layout::Index;
79
+
80
+ /// Long index used for pointer offsets
81
+ using LongIndex = typename Layout::LongIndex;
82
+
83
+ /// Coordinate in logical tensor space
84
+ using TensorCoord = typename Layout::TensorCoord;
85
+
86
+ /// Layout's stride vector
87
+ using Stride = typename Layout::Stride;
88
+
89
+ /// Tensor reference to device memory
90
+ using TensorRef = TensorRef<Element, Layout>;
91
+
92
+ /// Tensor reference to constant device memory
93
+ using ConstTensorRef = typename TensorRef::ConstTensorRef;
94
+
95
+ /// Tensor reference to device memory
96
+ using TensorView = TensorView<Element, Layout>;
97
+
98
+ /// Tensor reference to constant device memory
99
+ using ConstTensorView = typename TensorView::ConstTensorView;
100
+
101
+ /// Reference to element in tensor
102
+ using Reference = typename TensorRef::Reference;
103
+
104
+ /// Constant reference to element in tensor
105
+ using ConstReference = typename ConstTensorRef::Reference;
106
+
107
+ private:
108
+ using StorageUnit = typename platform::conditional_t<std::is_same_v<Element, bool>, uint8_t, // Avoid the std::vector<bool> specialization
109
+ typename platform::conditional_t<sizeof_bits<Element>::value % 8 == 0, // Handle subbyte types
110
+ Element, uint8_t>>;
111
+ using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
112
+ static constexpr int kContainerTypeNumBits = StorageContainerCalculator::kContainerTypeNumBits;
113
+ static constexpr int kContainerTypeNumLogicalElements = StorageContainerCalculator::kContainerTypeNumLogicalElements;
114
+ static constexpr int kContainerTypeNumBytes = StorageContainerCalculator::kContainerTypeNumBytes;
115
+ static constexpr int kContainerTypeNumStorageUnit = StorageContainerCalculator::kContainerTypeNumStorageUnit;
116
+
117
+ //
118
+ // Data members
119
+ //
120
+
121
+ /// Extent of tensor in logical dimensions
122
+ TensorCoord extent_;
123
+
124
+ /// Layout object
125
+ Layout layout_;
126
+
127
+ /// Host-side memory allocation
128
+ std::vector<StorageUnit> host_;
129
+
130
+ /// Device-side memory
131
+ device_memory::allocation<StorageUnit> device_;
132
+
133
+ /// number of containers
134
+ size_t count_to_container_storage_unit_count(size_t count) {
135
+ return (count + kContainerTypeNumLogicalElements - 1) / kContainerTypeNumLogicalElements * kContainerTypeNumStorageUnit;
136
+ }
137
+
138
+ public:
139
+ //
140
+ // Device and Host Methods
141
+ //
142
+
143
+ /// Default constructor
144
+ HostTensor() {}
145
+
146
+ /// Constructs a tensor given an extent. Assumes a packed layout
147
+ HostTensor(
148
+ TensorCoord const &extent,
149
+ bool device_backed = true
150
+ ) {
151
+
152
+ this->reset(extent, Layout::packed(extent), device_backed);
153
+ }
154
+
155
+ /// Constructs a tensor given an extent and layout
156
+ HostTensor(
157
+ TensorCoord const &extent,
158
+ Layout const &layout,
159
+ bool device_backed = true
160
+ ) {
161
+
162
+ this->reset(extent, layout, device_backed);
163
+ }
164
+
165
+ ~HostTensor() { }
166
+
167
+ /// Clears the HostTensor allocation to size/capacity = 0
168
+ void reset() {
169
+ extent_ = TensorCoord();
170
+ layout_ = Layout::packed(extent_);
171
+
172
+ host_.clear();
173
+ device_.reset();
174
+ }
175
+
176
+ /// Resizes internal memory allocations without affecting layout or extent
177
+ void reserve(
178
+ size_t count, ///< size of tensor in elements
179
+ bool device_backed_ = true) { ///< if true, device memory is also allocated
180
+ #if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
181
+ CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")");
182
+ #endif
183
+
184
+ device_.reset();
185
+ host_.clear();
186
+
187
+ size_t count_container = count_to_container_storage_unit_count(count);
188
+ #if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
189
+ CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")");
190
+ #endif
191
+ host_.resize(count_container);
192
+
193
+ // Allocate memory
194
+ StorageUnit* device_memory = nullptr;
195
+ if (device_backed_) {
196
+ #if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
197
+ CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")");
198
+ #endif
199
+ device_memory = device_memory::allocate<StorageUnit>(count_container);
200
+ }
201
+ device_.reset(device_memory, device_backed_ ? count_container : 0);
202
+ }
203
+
204
+ /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
205
+ /// extent and layout.
206
+ void reset(
207
+ TensorCoord const &extent, ///< extent of logical tensor
208
+ Layout const &layout, ///< layout object of tensor
209
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
210
+
211
+ extent_ = extent;
212
+ layout_ = layout;
213
+
214
+ reserve(size_t(layout_.capacity(extent_)), device_backed_);
215
+ }
216
+
217
+ /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
218
+ /// extent and layout. Assumes a packed tensor configuration.
219
+ void reset(
220
+ TensorCoord const &extent, ///< extent of logical tensor
221
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
222
+
223
+ reset(extent, Layout::packed(extent), device_backed_);
224
+ }
225
+
226
+ /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
227
+ /// To force allocation, call reset().
228
+ void resize(
229
+ TensorCoord const &extent, ///< extent of logical tensor
230
+ Layout const &layout, ///< layout object of tensor
231
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
232
+
233
+ extent_ = extent;
234
+ layout_ = layout;
235
+
236
+ LongIndex new_size = size_t(layout_.capacity(extent_));
237
+ LongIndex new_size_container = count_to_container_storage_unit_count((layout_.capacity(extent_)));
238
+
239
+ if (static_cast<decltype(host_.size())>(new_size_container) > host_.size()) {
240
+ reserve(new_size, device_backed_);
241
+ }
242
+ }
243
+
244
+ /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
245
+ /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration.
246
+ void resize(
247
+ TensorCoord const &extent, ///< extent of logical tensor
248
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
249
+
250
+ resize(extent, Layout::packed(extent), device_backed_);
251
+ }
252
+
253
+ /// Returns the logical number of elements stored in the host tensor
254
+ size_t size() const {
255
+ return layout_.capacity(extent_);
256
+ }
257
+
258
+ /// Returns the logical capacity in terms of number of elements. May be larger than the size().
259
+ LongIndex capacity() const {
260
+ return host_.size() / kContainerTypeNumStorageUnit * kContainerTypeNumLogicalElements;
261
+ }
262
+
263
+ /// Gets pointer to host data
264
+ Element * host_data() { return reinterpret_cast<Element *>(host_.data()); }
265
+
266
+ /// Gets pointer to host data with a pointer offset
267
+ Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset); }
268
+
269
+ /// Gets a reference to an element in host memory
270
+ Reference host_data(LongIndex idx) {
271
+ return ReferenceFactory<Element>::get(host_data(), idx);
272
+ }
273
+
274
+ /// Gets pointer to host data
275
+ Element const * host_data() const { return reinterpret_cast<Element const *>(host_.data()); }
276
+
277
+ /// Gets pointer to host data with a pointer offset
278
+ Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset); }
279
+
280
+ /// Gets a constant reference to an element in host memory
281
+ ConstReference host_data(LongIndex idx) const {
282
+ return ReferenceFactory<Element const>::get(host_data(), idx);
283
+ }
284
+
285
+ /// Gets pointer to device data
286
+ Element * device_data() { return reinterpret_cast<Element *>(device_.get()); }
287
+
288
+ /// Gets pointer to device data
289
+ Element const * device_data() const { return reinterpret_cast<Element const *>(device_.get()); }
290
+
291
+ /// Gets pointer to device data with a pointer offset
292
+ Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset); }
293
+
294
+ /// Gets pointer to device data with a pointer offset
295
+ Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset); }
296
+
297
+ /// Accesses the tensor reference pointing to data
298
+ TensorRef host_ref(LongIndex ptr_element_offset=0) { return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_); }
299
+
300
+ /// Accesses the tensor reference pointing to data
301
+ ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const { return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_); }
302
+
303
+ /// Accesses the tensor reference pointing to data
304
+ TensorRef device_ref(LongIndex ptr_element_offset=0) {
305
+ return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
306
+ }
307
+
308
+ /// Accesses the tensor reference pointing to data
309
+ ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const {
310
+ return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
311
+ }
312
+
313
+ /// Accesses the tensor reference pointing to data
314
+ TensorView host_view(LongIndex ptr_element_offset=0) {
315
+ return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
316
+ }
317
+
318
+ /// Accesses the tensor reference pointing to data
319
+ ConstTensorView host_view(LongIndex ptr_element_offset=0) const {
320
+ return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
321
+ }
322
+
323
+ /// Accesses the tensor reference pointing to data
324
+ TensorView device_view(LongIndex ptr_element_offset=0) {
325
+ return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
326
+ }
327
+
328
+ /// Accesses the tensor reference pointing to data
329
+ ConstTensorView device_view(LongIndex ptr_element_offset=0) const {
330
+ return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
331
+ }
332
+
333
+ /// Returns true if device memory is allocated
334
+ bool device_backed() const {
335
+ return (device_.get() == nullptr) ? false : true;
336
+ }
337
+
338
+
339
+ /// Returns the layout object
340
+ Layout & layout() {
341
+ return layout_;
342
+ }
343
+
344
+ /// Returns the layout object
345
+ Layout layout() const {
346
+ return layout_;
347
+ }
348
+
349
+ /// Returns the layout object's stride vector
350
+ Stride stride() const {
351
+ return layout_.stride();
352
+ }
353
+
354
+ /// Returns the layout object's stride vector
355
+ Stride & stride() {
356
+ return layout_.stride();
357
+ }
358
+
359
+ /// Returns the layout object's stride in a given physical dimension
360
+ LongIndex stride(int dim) const {
361
+ return layout_.stride().at(dim);
362
+ }
363
+
364
+ /// Returns the layout object's stride in a given physical dimension
365
+ LongIndex & stride(int dim) {
366
+ return layout_.stride().at(dim);
367
+ }
368
+
369
+ /// Computes the offset of an index from the origin of the tensor
370
+ LongIndex offset(TensorCoord const& coord) const {
371
+ return layout_(coord);
372
+ }
373
+
374
+ /// Returns a reference to the element at the logical Coord in host memory
375
+ Reference at(TensorCoord const& coord) {
376
+ return host_data(offset(coord));
377
+ }
378
+
379
+ /// Returns a const reference to the element at the logical Coord in host memory
380
+ ConstReference at(TensorCoord const& coord) const {
381
+ return host_data(offset(coord));
382
+ }
383
+
384
+ /// Returns the extent of the tensor
385
+ TensorCoord extent() const {
386
+ return extent_;
387
+ }
388
+
389
+ /// Returns the extent of the tensor
390
+ TensorCoord & extent() {
391
+ return extent_;
392
+ }
393
+
394
+ /// Copies data from device to host
395
+ void sync_host() {
396
+ if (device_backed()) {
397
+ device_memory::copy_to_host(
398
+ host_.data(), device_.get(), device_.size());
399
+ }
400
+ }
401
+
402
+ /// Copies data from host to device
403
+ void sync_device() {
404
+ if (device_backed()) {
405
+ device_memory::copy_to_device(
406
+ device_.get(), host_.data(), host_.size());
407
+ }
408
+ }
409
+
410
+ /// Copy data from a caller-supplied device pointer into host memory.
411
+ void copy_in_device_to_host(
412
+ Element const* ptr_device, ///< source device memory
413
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
414
+
415
+ if (count < 0) {
416
+ count = capacity();
417
+ }
418
+ else {
419
+ count = __NV_STD_MIN(capacity(), count);
420
+ }
421
+ size_t container_count = count_to_container_storage_unit_count(count);
422
+ device_memory::copy_to_host(
423
+ host_.data(), reinterpret_cast<StorageUnit const *>(ptr_device), container_count);
424
+ }
425
+
426
+ /// Copy data from a caller-supplied device pointer into host memory.
427
+ void copy_in_device_to_device(
428
+ Element const* ptr_device, ///< source device memory
429
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
430
+
431
+ if (count < 0) {
432
+ count = capacity();
433
+ }
434
+ else {
435
+ count = __NV_STD_MIN(capacity(), count);
436
+ }
437
+ size_t container_count = count_to_container_storage_unit_count(count);
438
+ device_memory::copy_device_to_device(
439
+ device_.get(), reinterpret_cast<StorageUnit const *>(ptr_device), container_count);
440
+ }
441
+
442
+ /// Copy data from a caller-supplied device pointer into host memory.
443
+ void copy_in_host_to_device(
444
+ Element const* ptr_host, ///< source host memory
445
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
446
+
447
+ if (count < 0) {
448
+ count = capacity();
449
+ }
450
+ else {
451
+ count = __NV_STD_MIN(capacity(), count);
452
+ }
453
+ size_t container_count = count_to_container_storage_unit_count(count);
454
+ device_memory::copy_to_device(
455
+ device_.get(), reinterpret_cast<StorageUnit const *>(ptr_host), container_count);
456
+ }
457
+
458
+ /// Copy data from a caller-supplied device pointer into host memory.
459
+ void copy_in_host_to_host(
460
+ Element const* ptr_host, ///< source host memory
461
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
462
+
463
+ if (count < 0) {
464
+ count = capacity();
465
+ }
466
+ else {
467
+ count = __NV_STD_MIN(capacity(), count);
468
+ }
469
+ size_t container_count = count_to_container_storage_unit_count(count);
470
+ device_memory::copy_host_to_host(
471
+ host_.data(), reinterpret_cast<StorageUnit const *>(ptr_host), container_count);
472
+ }
473
+
474
+ /// Copy data from a caller-supplied device pointer into host memory.
475
+ void copy_out_device_to_host(
476
+ Element * ptr_host, ///< source device memory
477
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
478
+
479
+ if (count < 0) {
480
+ count = capacity();
481
+ }
482
+ else {
483
+ count = __NV_STD_MIN(capacity(), count);
484
+ }
485
+ size_t container_count = count_to_container_storage_unit_count(count);
486
+ device_memory::copy_to_host(
487
+ reinterpret_cast<StorageUnit *>(ptr_host), device_.get(), container_count);
488
+ }
489
+
490
+ /// Copy data from a caller-supplied device pointer into host memory.
491
+ void copy_out_device_to_device(
492
+ Element * ptr_device, ///< source device memory
493
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
494
+
495
+ if (count < 0) {
496
+ count = capacity();
497
+ }
498
+ else {
499
+ count = __NV_STD_MIN(capacity(), count);
500
+ }
501
+ size_t container_count = count_to_container_storage_unit_count(count);
502
+ device_memory::copy_device_to_device(
503
+ reinterpret_cast<StorageUnit *>(ptr_device), device_.get(), container_count);
504
+ }
505
+
506
+ /// Copy data from a caller-supplied device pointer into host memory.
507
+ void copy_out_host_to_device(
508
+ Element * ptr_device, ///< source host memory
509
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
510
+
511
+ if (count < 0) {
512
+ count = capacity();
513
+ }
514
+ else {
515
+ count = __NV_STD_MIN(capacity(), count);
516
+ }
517
+ size_t container_count = count_to_container_storage_unit_count(count);
518
+ device_memory::copy_to_device(
519
+ reinterpret_cast<StorageUnit *>(ptr_device), host_.data(), container_count);
520
+ }
521
+
522
+ /// Copy data from a caller-supplied device pointer into host memory.
523
+ void copy_out_host_to_host(
524
+ Element * ptr_host, ///< source host memory
525
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
526
+
527
+ if (count < 0) {
528
+ count = capacity();
529
+ }
530
+ else {
531
+ count = __NV_STD_MIN(capacity(), count);
532
+ }
533
+ size_t container_count = count_to_container_storage_unit_count(count);
534
+ device_memory::copy_host_to_host(
535
+ reinterpret_cast<StorageUnit *>(ptr_host), host_.data(), container_count);
536
+ }
537
+ };
538
+
539
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
540
+
541
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor_planar_complex.h ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ /*! \file
34
+ \brief HostTensor contributes management for both host and device memory.
35
+
36
+ HostTensor allocates host and device memory upon construction. Basic element-wise operations on
37
+ host memory synchronize device memory automatically. Explicit copy operations provide abstractions
38
+ for CUDA memcpy operations.
39
+
40
+ Call {host, device}_{data, ref, view}() for accessing host or device memory.
41
+
42
+ See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details.
43
+ */
44
+
45
+ #include <vector>
46
+
47
+ #include "cutlass/cutlass.h"
48
+
49
+ #include "cutlass/tensor_ref_planar_complex.h"
50
+ #include "cutlass/tensor_view_planar_complex.h"
51
+
52
+ #include "device_memory.h"
53
+
54
+ namespace cutlass {
55
+
56
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
+ /// Host tensor
59
+ template <
60
+ /// Data type of element stored within tensor (concept: NumericType)
61
+ typename Element_,
62
+ /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
63
+ typename Layout_
64
+ >
65
+ class HostTensorPlanarComplex {
66
+ public:
67
+
68
+ /// Data type of individual access
69
+ using Element = Element_;
70
+
71
+ /// Mapping function from logical coordinate to linear memory
72
+ using Layout = Layout_;
73
+
74
+ /// Logical rank of tensor index space
75
+ static int const kRank = Layout::kRank;
76
+
77
+ /// Index type
78
+ using Index = typename Layout::Index;
79
+
80
+ /// Long index used for pointer offsets
81
+ using LongIndex = typename Layout::LongIndex;
82
+
83
+ /// Coordinate in logical tensor space
84
+ using TensorCoord = typename Layout::TensorCoord;
85
+
86
+ /// Layout's stride vector
87
+ using Stride = typename Layout::Stride;
88
+
89
+ /// Tensor reference to device memory
90
+ using TensorRef = TensorRefPlanarComplex<Element, Layout>;
91
+
92
+ /// Tensor reference to constant device memory
93
+ using ConstTensorRef = typename TensorRef::ConstTensorRef;
94
+
95
+ /// Tensor reference to device memory
96
+ using TensorView = TensorViewPlanarComplex<Element, Layout>;
97
+
98
+ /// Tensor reference to constant device memory
99
+ using ConstTensorView = typename TensorView::ConstTensorView;
100
+
101
+ /// Reference to element in tensor
102
+ using Reference = typename TensorRef::Reference;
103
+
104
+ /// Constant reference to element in tensor
105
+ using ConstReference = typename ConstTensorRef::Reference;
106
+
107
+ private:
108
+
109
+ //
110
+ // Data members
111
+ //
112
+
113
+ /// Extent of tensor in logical dimensions
114
+ TensorCoord extent_;
115
+
116
+ /// Layout object
117
+ Layout layout_;
118
+
119
+ /// Host-side memory allocation
120
+ std::vector<Element> host_;
121
+
122
+ /// Device-side memory
123
+ device_memory::allocation<Element> device_;
124
+
125
+ public:
126
+ //
127
+ // Device and Host Methods
128
+ //
129
+
130
+ /// Default constructor
131
+ HostTensorPlanarComplex() {}
132
+
133
+ /// Constructs a tensor given an extent. Assumes a packed layout
134
+ HostTensorPlanarComplex(
135
+ TensorCoord const &extent,
136
+ bool device_backed = true
137
+ ) {
138
+
139
+ this->reset(extent, Layout::packed(extent), device_backed);
140
+ }
141
+
142
+ /// Constructs a tensor given an extent and layout
143
+ HostTensorPlanarComplex(
144
+ TensorCoord const &extent,
145
+ Layout const &layout,
146
+ bool device_backed = true
147
+ ) {
148
+
149
+ this->reset(extent, layout, device_backed);
150
+ }
151
+
152
+ ~HostTensorPlanarComplex() { }
153
+
154
+ /// Clears the HostTensor allocation to size/capacity = 0
155
+ void reset() {
156
+ extent_ = TensorCoord();
157
+ layout_ = Layout::packed(extent_);
158
+
159
+ host_.clear();
160
+ device_.reset();
161
+ }
162
+
163
+ /// Resizes internal memory allocations without affecting layout or extent
164
+ void reserve(
165
+ size_t count, ///< size of tensor in elements
166
+ bool device_backed_ = true) { ///< if true, device memory is also allocated
167
+
168
+ device_.reset();
169
+ host_.clear();
170
+
171
+ host_.resize(count * 2);
172
+
173
+ // Allocate memory
174
+ Element* device_memory = nullptr;
175
+ if (device_backed_) {
176
+ device_memory = device_memory::allocate<Element>(count * 2);
177
+ }
178
+ device_.reset(device_memory, device_backed_ ? count * 2 : 0);
179
+ }
180
+
181
+ /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
182
+ /// extent and layout.
183
+ void reset(
184
+ TensorCoord const &extent, ///< extent of logical tensor
185
+ Layout const &layout, ///< layout object of tensor
186
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
187
+
188
+ extent_ = extent;
189
+ layout_ = layout;
190
+
191
+ reserve(size_t(layout_.capacity(extent_)), device_backed_);
192
+ }
193
+
194
+ /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
195
+ /// extent and layout. Assumes a packed tensor configuration.
196
+ void reset(
197
+ TensorCoord const &extent, ///< extent of logical tensor
198
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
199
+
200
+ reset(extent, Layout::packed(extent), device_backed_);
201
+ }
202
+
203
+ /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
204
+ /// To force allocation, call reset().
205
+ void resize(
206
+ TensorCoord const &extent, ///< extent of logical tensor
207
+ Layout const &layout, ///< layout object of tensor
208
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
209
+
210
+ extent_ = extent;
211
+ layout_ = layout;
212
+
213
+ LongIndex new_size = size_t(layout_.capacity(extent_));
214
+
215
+ if (static_cast<decltype(host_.size())>(new_size * 2) > host_.size()) {
216
+ reserve(new_size);
217
+ }
218
+ }
219
+
220
+ /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
221
+ /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration.
222
+ void resize(
223
+ TensorCoord const &extent, ///< extent of logical tensor
224
+ bool device_backed_ = true) { ///< if true, device memory is also allocated.
225
+
226
+ resize(extent, Layout::packed(extent), device_backed_);
227
+ }
228
+
229
+ /// Returns the number of elements stored in the host tensor
230
+ size_t size() const {
231
+ return host_.size() / 2;
232
+ }
233
+
234
+ /// Returns the logical capacity based on extent and layout. May differ from size().
235
+ LongIndex capacity() const {
236
+ return layout_.capacity(extent_);
237
+ }
238
+
239
+ /// Stride between real and imaginary parts
240
+ LongIndex imaginary_stride() const {
241
+ return host_.size() / 2;
242
+ }
243
+
244
+ /// Gets pointer to host data
245
+ Element * host_data() { return host_.data(); }
246
+
247
+ /// Gets pointer to host data imaginary part
248
+ Element * host_data_imag() { return host_.data() + imaginary_stride(); }
249
+
250
+ /// Gets pointer to host data with a pointer offset
251
+ Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return host_data() + ptr_element_offset; }
252
+
253
+ /// Gets pointer to host data with a pointer offset
254
+ Element * host_data_imag_ptr_offset(LongIndex ptr_element_offset) { return host_data_imag() + ptr_element_offset; }
255
+
256
+ /// Gets a reference to an element in host memory
257
+ Reference host_data(LongIndex idx) {
258
+ return PlanarComplexReference<Element>(host_data() + idx, host_data_imag() + idx);
259
+ }
260
+
261
+ /// Gets pointer to host data
262
+ Element const * host_data() const { return host_.data(); }
263
+
264
+ /// Gets pointer to host data imaginary part
265
+ Element const * host_data_imag() const { return host_.data() + imaginary_stride(); }
266
+
267
+ /// Gets a constant reference to an element in host memory
268
+ ConstReference host_data(LongIndex idx) const {
269
+ return PlanarComplexReference<Element const>(host_data() + idx, host_data_imag() + idx);
270
+ }
271
+
272
+ /// Gets pointer to device data
273
+ Element * device_data() { return device_.get(); }
274
+
275
+ /// Gets pointer to device data with a pointer offset
276
+ Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return device_.get() + ptr_element_offset; }
277
+
278
+ /// Gets pointer to device data
279
+ Element const * device_data() const { return device_.get(); }
280
+
281
+ /// Gets pointer to device data with a pointer offset
282
+ Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return device_.get() + ptr_element_offset; }
283
+
284
+ /// Gets a pointer to the device data imaginary part
285
+ Element * device_data_imag() { return device_.get() + imaginary_stride(); }
286
+
287
+ /// Accesses the tensor reference pointing to data
288
+ TensorRef host_ref(LongIndex ptr_element_offset=0) {
289
+ return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
290
+ }
291
+
292
+ /// Returns a tensor reference to the real part of the tensor
293
+ cutlass::TensorRef<Element, Layout> host_ref_real() {
294
+ return cutlass::TensorRef<Element, Layout>(host_data(), layout_);
295
+ }
296
+
297
+ /// Returns a tensor reference to the real part of the tensor
298
+ cutlass::TensorRef<Element, Layout> host_ref_imag() {
299
+ return cutlass::TensorRef<Element, Layout>(host_data_ptr_offset(imaginary_stride()), layout_);
300
+ }
301
+
302
+ /// Accesses the tensor reference pointing to data
303
+ ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const {
304
+ return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
305
+ }
306
+
307
+ /// Accesses the tensor reference pointing to data
308
+ TensorRef device_ref(LongIndex ptr_element_offset=0) {
309
+ return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
310
+ }
311
+
312
+ /// Accesses the tensor reference pointing to data
313
+ ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const {
314
+ return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
315
+ }
316
+
317
+ /// Returns a tensor reference to the real part of the tensor
318
+ cutlass::TensorRef<Element, Layout> device_ref_real() {
319
+ return cutlass::TensorRef<Element, Layout>(device_data(), layout_);
320
+ }
321
+
322
+ /// Returns a tensor reference to the real part of the tensor
323
+ cutlass::TensorRef<Element, Layout> device_ref_imag() {
324
+ return cutlass::TensorRef<Element, Layout>(device_data_ptr_offset(imaginary_stride()), layout_);
325
+ }
326
+
327
+ /// Accesses the tensor reference pointing to data
328
+ TensorView host_view(LongIndex ptr_element_offset=0) {
329
+ return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
330
+ }
331
+
332
+ /// Accesses the tensor reference pointing to data
333
+ ConstTensorView host_view(LongIndex ptr_element_offset=0) const {
334
+ return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
335
+ }
336
+
337
+ /// Accesses the tensor reference pointing to data
338
+ cutlass::TensorView<Element, Layout> host_view_real() {
339
+ return cutlass::TensorView<Element, Layout>(host_data(), layout_, extent_);
340
+ }
341
+
342
+ /// Accesses the tensor reference pointing to data
343
+ cutlass::TensorView<Element, Layout> host_view_imag() {
344
+ return cutlass::TensorView<Element, Layout>(host_data_ptr_offset(imaginary_stride()), layout_, extent_);
345
+ }
346
+
347
+ /// Accesses the tensor reference pointing to data
348
+ TensorView device_view(LongIndex ptr_element_offset=0) {
349
+ return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
350
+ }
351
+
352
+ /// Accesses the tensor reference pointing to data
353
+ ConstTensorView device_view(LongIndex ptr_element_offset=0) const {
354
+ return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
355
+ }
356
+
357
+ /// Accesses the tensor reference pointing to data
358
+ cutlass::TensorView<Element, Layout> device_view_real() {
359
+ return cutlass::TensorView<Element, Layout>(device_data(), layout_, extent_);
360
+ }
361
+
362
+ /// Accesses the tensor reference pointing to data
363
+ cutlass::TensorView<Element, Layout> device_view_imag() {
364
+ return cutlass::TensorView<Element, Layout>(device_data_ptr_offset(imaginary_stride()), layout_, extent_);
365
+ }
366
+
367
+ /// Returns true if device memory is allocated
368
+ bool device_backed() const {
369
+ return (device_.get() == nullptr) ? false : true;
370
+ }
371
+
372
+ /// Returns the layout object
373
+ Layout layout() const {
374
+ return layout_;
375
+ }
376
+
377
+ /// Returns the layout object's stride vector
378
+ Stride stride() const {
379
+ return layout_.stride();
380
+ }
381
+
382
+ /// Returns the layout object's stride in a given physical dimension
383
+ Index stride(int dim) const {
384
+ return layout_.stride().at(dim);
385
+ }
386
+
387
+ /// Computes the offset of an index from the origin of the tensor
388
+ LongIndex offset(TensorCoord const& coord) const {
389
+ return layout_(coord);
390
+ }
391
+
392
+ /// Returns a reference to the element at the logical Coord in host memory
393
+ Reference at(TensorCoord const& coord) {
394
+ return host_data(offset(coord));
395
+ }
396
+
397
+ /// Returns a const reference to the element at the logical Coord in host memory
398
+ ConstReference at(TensorCoord const& coord) const {
399
+ return host_data(offset(coord));
400
+ }
401
+
402
+ /// Returns the extent of the tensor
403
+ TensorCoord extent() const {
404
+ return extent_;
405
+ }
406
+
407
+ /// Returns the extent of the tensor
408
+ TensorCoord & extent() {
409
+ return extent_;
410
+ }
411
+
412
+ /// Copies data from device to host
413
+ void sync_host() {
414
+ if (device_backed()) {
415
+ device_memory::copy_to_host(
416
+ host_data(), device_data(), imaginary_stride() * 2);
417
+ }
418
+ }
419
+
420
+ /// Copies data from host to device
421
+ void sync_device() {
422
+ if (device_backed()) {
423
+ device_memory::copy_to_device(
424
+ device_data(), host_data(), imaginary_stride() * 2);
425
+ }
426
+ }
427
+
428
+ /// Copy data from a caller-supplied device pointer into host memory.
429
+ void copy_in_device_to_host(
430
+ Element const* ptr_device_real, ///< source device memory
431
+ Element const* ptr_device_imag, ///< source device memory
432
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
433
+
434
+ if (count < 0) {
435
+ count = capacity();
436
+ }
437
+ else {
438
+ count = __NV_STD_MIN(capacity(), count);
439
+ }
440
+
441
+ device_memory::copy_to_host(
442
+ host_data(), ptr_device_real, count);
443
+
444
+ device_memory::copy_to_host(
445
+ host_data_imag(), ptr_device_imag, count);
446
+ }
447
+
448
+ /// Copy data from a caller-supplied device pointer into host memory.
449
+ void copy_in_device_to_device(
450
+ Element const* ptr_device_real, ///< source device memory
451
+ Element const* ptr_device_imag, ///< source device memory
452
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
453
+
454
+ if (count < 0) {
455
+ count = capacity();
456
+ }
457
+ else {
458
+ count = __NV_STD_MIN(capacity(), count);
459
+ }
460
+
461
+ device_memory::copy_device_to_device(
462
+ device_data(), ptr_device_real, count);
463
+
464
+ device_memory::copy_device_to_device(
465
+ device_data_imag(), ptr_device_imag, count);
466
+ }
467
+
468
+ /// Copy data from a caller-supplied device pointer into host memory.
469
+ void copy_in_host_to_device(
470
+ Element const* ptr_host_real, ///< source host memory
471
+ Element const* ptr_host_imag, ///< source host memory
472
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
473
+
474
+ if (count < 0) {
475
+ count = capacity();
476
+ }
477
+ else {
478
+ count = __NV_STD_MIN(capacity(), count);
479
+ }
480
+
481
+ device_memory::copy_to_device(
482
+ device_data(), ptr_host_real, count);
483
+
484
+ device_memory::copy_to_device(
485
+ device_data_imag(), ptr_host_imag, count);
486
+ }
487
+
488
+ /// Copy data from a caller-supplied device pointer into host memory.
489
+ void copy_in_host_to_host(
490
+ Element const* ptr_host_real, ///< source host memory
491
+ Element const* ptr_host_imag, ///< source host memory
492
+ LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten.
493
+
494
+ if (count < 0) {
495
+ count = capacity();
496
+ }
497
+ else {
498
+ count = __NV_STD_MIN(capacity(), count);
499
+ }
500
+
501
+ device_memory::copy_host_to_host(
502
+ host_data(), ptr_host_real, count);
503
+
504
+ device_memory::copy_host_to_host(
505
+ host_data_imag(), ptr_host_imag, count);
506
+ }
507
+
508
+ /// Copy data from a caller-supplied device pointer into host memory.
509
+ void copy_out_device_to_host(
510
+ Element * ptr_host_real, ///< source device memory
511
+ Element * ptr_host_imag, ///< source device memory
512
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
513
+
514
+ if (count < 0) {
515
+ count = capacity();
516
+ }
517
+ else {
518
+ count = __NV_STD_MIN(capacity(), count);
519
+ }
520
+
521
+ device_memory::copy_to_host(
522
+ ptr_host_real, device_data(), count);
523
+
524
+ device_memory::copy_to_host(
525
+ ptr_host_imag, device_data_imag(), count);
526
+ }
527
+
528
+ /// Copy data from a caller-supplied device pointer into host memory.
529
+ void copy_out_device_to_device(
530
+ Element * ptr_device_real, ///< source device memory
531
+ Element * ptr_device_imag, ///< source device memory
532
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
533
+
534
+ if (count < 0) {
535
+ count = capacity();
536
+ }
537
+ else {
538
+ count = __NV_STD_MIN(capacity(), count);
539
+ }
540
+
541
+ device_memory::copy_device_to_device(
542
+ ptr_device_real, device_data(), count);
543
+
544
+ device_memory::copy_device_to_device(
545
+ ptr_device_imag, device_data_imag(), count);
546
+ }
547
+
548
+ /// Copy data from a caller-supplied device pointer into host memory.
549
+ void copy_out_host_to_device(
550
+ Element * ptr_device_real, ///< source device memory
551
+ Element * ptr_device_imag, ///< source device memory
552
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
553
+
554
+ if (count < 0) {
555
+ count = capacity();
556
+ }
557
+ else {
558
+ count = __NV_STD_MIN(capacity(), count);
559
+ }
560
+
561
+ device_memory::copy_to_device(
562
+ ptr_device_real, host_data(), count);
563
+
564
+ device_memory::copy_to_device(
565
+ ptr_device_imag, host_data_imag(), count);
566
+ }
567
+
568
+ /// Copy data from a caller-supplied device pointer into host memory.
569
+ void copy_out_host_to_host(
570
+ Element * ptr_host_real, ///< source host memory
571
+ Element * ptr_host_imag, ///< source host memory
572
+ LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten.
573
+
574
+ if (count < 0) {
575
+ count = capacity();
576
+ }
577
+ else {
578
+ count = __NV_STD_MIN(capacity(), count);
579
+ }
580
+
581
+ device_memory::copy_host_to_host(
582
+ ptr_host_real, host_data(), count);
583
+
584
+ device_memory::copy_host_to_host(
585
+ ptr_host_imag, host_data_imag(), count);
586
+ }
587
+ };
588
+
589
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
590
+
591
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_uncompress.h ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief uncompress sparse matrix from the host side
34
+ */
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/util/host_tensor.h"
39
+ #include "cutlass/tensor_view.h"
40
+ #include "cutlass/util/tensor_view_io.h"
41
+ #include "cutlass/util/reference/host/gemm.h"
42
+
43
+ namespace cutlass {
44
+
45
+ // uncompress sparse tensor core A matrix
46
+ template <typename ElementA, typename LayoutA, typename ElementE,
47
+ typename LayoutE>
48
+ void uncompress(TensorRef<ElementA, LayoutA> uncompressed_tensor_a,
49
+ TensorRef<ElementA, LayoutA> tensor_a,
50
+ TensorRef<ElementE, LayoutE> tensor_e, int row, int col) {
51
+ // How many uncompressed data we can get with ElementE meta data
52
+ int DecompressedElementsPerElementE =
53
+ 256 / cutlass::sizeof_bits<ElementA>::value;
54
+
55
+ // Process 4bit meta data a time
56
+ int step;
57
+
58
+ // 1:2 or 2:4 or 4:8
59
+ int a, b;
60
+
61
+ if (cutlass::sizeof_bits<ElementA>::value == 4) {
62
+ step = 8;
63
+ a = 4;
64
+ b = 8;
65
+ } else if (cutlass::sizeof_bits<ElementA>::value == 8) {
66
+ step = 4;
67
+ a = 2;
68
+ b = 4;
69
+ } else if (cutlass::sizeof_bits<ElementA>::value == 16) {
70
+ step = 4;
71
+ a = 2;
72
+ b = 4;
73
+ } else if (cutlass::sizeof_bits<ElementA>::value == 32) {
74
+ step = 2;
75
+ a = 1;
76
+ b = 2;
77
+ }
78
+
79
+ int ElementsPerE = (cutlass::sizeof_bits<ElementA>::value == 4) ? 2 : 1;
80
+
81
+ for (int r = 0; r < row; ++r) {
82
+ for (int c = 0; c < (col / DecompressedElementsPerElementE); ++c) {
83
+
84
+ ElementE meta = tensor_e.at(MatrixCoord(r, c));
85
+
86
+ for (int i = 0; i < DecompressedElementsPerElementE; i += step) {
87
+ int e = (meta >> (i / step * 4)) & 0xf;
88
+ int idx0 = e & 0x3;
89
+ int idx1 = e >> 2;
90
+
91
+ if (a == 1) idx0 = idx0 / 2;
92
+
93
+ for (int ii = 0; ii < step; ii += ElementsPerE) {
94
+ int real_col =
95
+ c * DecompressedElementsPerElementE + i + ii;
96
+ int compressed_col = (real_col / b) * a;
97
+
98
+ if (ii == (idx0 * ElementsPerE)) {
99
+ uncompressed_tensor_a.at(MatrixCoord(r, real_col)) =
100
+ tensor_a.at(MatrixCoord(r, compressed_col));
101
+ if (ElementsPerE == 2)
102
+ uncompressed_tensor_a.at(MatrixCoord(r, real_col + 1)) =
103
+ tensor_a.at(MatrixCoord(r, compressed_col + 1));
104
+ } else if ((ii == (idx1 * ElementsPerE)) && (a != 1)) {
105
+ uncompressed_tensor_a.at(MatrixCoord(r, real_col)) =
106
+ tensor_a.at(MatrixCoord(r, compressed_col + ElementsPerE));
107
+ if (ElementsPerE == 2)
108
+ uncompressed_tensor_a.at(MatrixCoord(r, real_col + 1)) =
109
+ tensor_a.at(
110
+ MatrixCoord(r, compressed_col + ElementsPerE + 1));
111
+ } else {
112
+ uncompressed_tensor_a.at(MatrixCoord(r, real_col)) =
113
+ ElementA(0);
114
+ if (ElementsPerE == 2)
115
+ uncompressed_tensor_a.at(MatrixCoord(r, real_col + 1)) =
116
+ ElementA(0);
117
+ }
118
+ }
119
+ }
120
+ }
121
+ }
122
+ }
123
+
124
+ // uncompress ELL block sparse matrix
125
+ template <typename ElementA, typename LayoutA,
126
+ typename ElementE, typename LayoutE>
127
+ void uncompress_ell_block_sparse(
128
+ TensorRef<ElementA, LayoutA> uncompressed_tensor_a,
129
+ TensorRef<ElementA, LayoutA> tensor_a,
130
+ TensorRef<ElementE, LayoutE> ell_idx,
131
+ int rows, int cols,
132
+ int ell_num_cols, int ell_blocksize) {
133
+
134
+ for (int r = 0; r < rows / ell_blocksize; ++r) {
135
+ for (int c = 0; c < ell_num_cols / ell_blocksize; ++c) {
136
+
137
+ ElementE idx = ell_idx.at(MatrixCoord(r, c));
138
+
139
+ if (idx != -1) {
140
+ int row_begin = r * ell_blocksize;
141
+ int col_begin_real = idx * ell_blocksize;
142
+ int col_begin = c * ell_blocksize;
143
+
144
+ for (int i = 0; i < ell_blocksize; ++i) {
145
+ for (int j = 0; j < ell_blocksize; ++j) {
146
+ uncompressed_tensor_a.at(MatrixCoord(row_begin + i, col_begin_real + j)) =
147
+ tensor_a.at(
148
+ MatrixCoord(row_begin + i, col_begin +j));
149
+ }
150
+ }
151
+ }
152
+ }
153
+ }
154
+ }
155
+
156
+ } // namespace cutlass
157
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/index_sequence.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include "cutlass/cutlass.h"
35
+ #include "cutlass/numeric_types.h"
36
+
37
+ // integer_sequence moved to cutlass/numeric_types.h
38
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/mixed_dtype_utils.hpp ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Utilities for mixed input data type kernels.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cuda.h>
38
+ #include "cute/layout.hpp"
39
+ #include "cute/tensor.hpp"
40
+ #include "cute/arch/mma_sm90.hpp"
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/util/device_memory.h"
43
+ #include "cutlass/util/reference/device/tensor_fill.h"
44
+ #include "cute/util/type_traits.hpp"
45
+
46
+ namespace cutlass {
47
+
48
// Abort the program with a diagnostic if a CUDA runtime call fails.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// remains safe in unbraced if/else contexts (the original bare-brace form
// made `if (c) CUDA_CHECK(x); else ...` ill-formed).
#define CUDA_CHECK(status)                                                \
  do {                                                                    \
    cudaError_t error = (status);                                         \
    if (error != cudaSuccess) {                                           \
      std::cerr << "Got bad cuda status: " << cudaGetErrorString(error)   \
                << " at line: " << __LINE__ << std::endl;                 \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
57
+
58
+ template <
59
+ class QuantizedElement,
60
+ class DequantizedElement,
61
+ class OperandLayout,
62
+ class ElementScale,
63
+ class ElementZero,
64
+ class ScaleBroadCastLayout,
65
+ class ThrLayout>
66
+ __global__ void dequantize_kernel(DequantizedElement* dq_buffer,
67
+ QuantizedElement const* q_buffer,
68
+ OperandLayout const operand_layout,
69
+ ElementScale const* scale_buffer,
70
+ ElementZero const* zero_buffer,
71
+ ScaleBroadCastLayout const broadcasted_scale_layout,
72
+ ThrLayout thr_layout) {
73
+ using namespace cute;
74
+
75
+ // Represent the full tensors to gmem elements.
76
+ // These are expected to have shape [MN, K, L]
77
+ cute::Tensor gmem_op_dq = cute::make_tensor(cute::make_gmem_ptr(dq_buffer), operand_layout);
78
+ cute::Tensor gmem_op_q = cute::make_tensor(cute::make_gmem_ptr<QuantizedElement const>(q_buffer), operand_layout);
79
+ // While the scales are expected to have shape [MN, G, L] but with a stride to allow broadcasting
80
+ // It is expected that K % G == 0
81
+ cute::Tensor gmem_scale_broadcasted = cute::make_tensor(make_gmem_ptr(scale_buffer), broadcasted_scale_layout);
82
+ cute::Tensor gmem_zero_broadcasted = cute::make_tensor(make_gmem_ptr(zero_buffer), broadcasted_scale_layout);
83
+
84
+ // Assign 1 thread per element in the thread block
85
+ auto blk_shape = cute::make_shape(size<0>(thr_layout), _1{}, _1{}); //
86
+ auto blk_coord = cute::make_coord(_, blockIdx.x, blockIdx.y); // (MN, K, L)
87
+
88
+ // Tile across the block
89
+ auto gOp_dq = cute::local_tile(gmem_op_dq, blk_shape, blk_coord);
90
+ auto gScale = cute::local_tile(gmem_scale_broadcasted, blk_shape, blk_coord);
91
+ auto gZero = cute::local_tile(gmem_zero_broadcasted, blk_shape, blk_coord);
92
+ auto gOp_q = cute::local_tile(gmem_op_q, blk_shape, blk_coord);
93
+
94
+ auto tOpDq_gOpDq = cute::local_partition(gOp_dq, thr_layout, threadIdx.x);
95
+ auto tScale_gScale = cute::local_partition(gScale, thr_layout, threadIdx.x);
96
+ auto tZero_gZero = cute::local_partition(gZero, thr_layout, threadIdx.x);
97
+ auto tOpQ_gOpQ = cute::local_partition(gOp_q, thr_layout, threadIdx.x);
98
+
99
+ // Make a fragment of registers to hold gmem loads
100
+ cute::Tensor rmem_op_q = cute::make_fragment_like(tOpQ_gOpQ(_, _, _, 0));
101
+ cute::Tensor rmem_scale = cute::make_fragment_like(tScale_gScale(_, _, _, 0));
102
+ cute::Tensor rmem_zero = cute::make_fragment_like(tZero_gZero(_, _, _, 0));
103
+ cute::Tensor rmem_op_dq = cute::make_fragment_like(tOpDq_gOpDq(_, _, _, 0));
104
+ cute::Tensor rmem_op_scaled = cute::make_fragment_like<ElementScale>(rmem_op_dq);
105
+ cute::Tensor rmem_zero_buf = cute::make_fragment_like<ElementScale>(rmem_zero);
106
+
107
+ cute::Tensor pred_id = cute::make_identity_tensor(shape(operand_layout));
108
+ auto pred_blk_tile = cute::local_tile(pred_id, blk_shape, blk_coord);
109
+ auto pred_thr_partition = cute::local_partition(pred_blk_tile, thr_layout, threadIdx.x);
110
+
111
+ const auto num_iters = cute::size<3>(tOpDq_gOpDq);
112
+
113
+ for (int ii = 0; ii < num_iters; ++ii) {
114
+ const auto thread_offset = cute::get<0>(pred_thr_partition(0, 0, 0, ii));
115
+ if (thread_offset < cute::size<0>(operand_layout)) {
116
+ cute::copy(tOpQ_gOpQ(_, _, _, ii), rmem_op_q);
117
+ cute::copy(tScale_gScale(_, _, _, ii), rmem_scale);
118
+ cute::copy(tZero_gZero(_, _, _, ii), rmem_zero);
119
+ cute::transform(rmem_op_q, rmem_op_scaled, [] (const QuantizedElement& elt) { return ElementScale(elt); } );
120
+ cute::transform(rmem_zero, rmem_zero_buf, [] (const ElementZero& elt) { return ElementScale(elt); } );
121
+ cute::transform(rmem_op_scaled, rmem_scale, rmem_op_scaled, cute::multiplies{});
122
+ cute::transform(rmem_op_scaled, rmem_zero_buf, rmem_op_scaled, cute::plus{});
123
+ cute::transform(rmem_op_scaled, rmem_op_dq, [] (const ElementScale& elt) { return DequantizedElement(elt); } );
124
+ cute::copy(rmem_op_dq, tOpDq_gOpDq(_, _, _, ii));
125
+ }
126
+ }
127
+ }
128
+
129
+ template <
130
+ class QuantizedElement,
131
+ class DequantizedElement,
132
+ class OperandLayout,
133
+ class ElementScale,
134
+ class ElementZero,
135
+ class ScaleLayout>
136
+ static void dequantize(DequantizedElement* dq_buffer,
137
+ QuantizedElement const* q_buffer,
138
+ OperandLayout const operand_layout,
139
+ ElementScale const* scale_buffer,
140
+ ElementZero const* zero_buffer,
141
+ ScaleLayout const scale_layout,
142
+ int const group_size,
143
+ cudaStream_t &stream) {
144
+ using namespace cute;
145
+
146
+ constexpr int tpb = 128;
147
+ auto thr_layout = make_layout(make_shape(Int<tpb>{}));
148
+
149
+ const auto num_rows = get<0>(shape(operand_layout));
150
+ const auto gemm_k = get<1>(shape(operand_layout)); // [MN, K, L]
151
+ const auto batches = get<2>(shape(operand_layout)); // [MN, K, L]
152
+ const auto scale_k = get<1>(shape(scale_layout)); // [MN, Scale_K, L]
153
+
154
+ if (num_rows != size<0>(scale_layout)) {
155
+ std::cerr << "Invalid first dimension for scales. Must match first dim for weights."
156
+ << " But got shapes " << shape(operand_layout) << " " << shape(scale_layout)
157
+ << std::endl;
158
+ exit(-1);
159
+ }
160
+
161
+ const auto scale_stride0 = get<0>(stride(scale_layout));
162
+ const auto scale_stride1 = get<1>(stride(scale_layout));
163
+ const auto scale_stride2 = get<2>(stride(scale_layout));
164
+
165
+ auto scale_shape_bcast = make_shape(num_rows, make_shape(group_size, scale_k), batches);
166
+ auto scale_stride_bcast = make_stride(scale_stride0, make_stride(0, scale_stride1), scale_stride2);
167
+ auto scale_layout_bcast = make_layout(scale_shape_bcast, scale_stride_bcast);
168
+
169
+ const auto blocks_x = gemm_k;
170
+ const auto blocks_y = batches;
171
+
172
+ dim3 blocks(blocks_x, blocks_y, 1);
173
+ dequantize_kernel<<<blocks, tpb, 0, stream>>>(dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout);
174
+ CUDA_CHECK(cudaStreamSynchronize(stream));
175
+ }
176
+
177
+ template <typename T>
178
+ class packed_scale_t {
179
+ public:
180
+ static_assert(cute::is_same_v<T, cutlass::int8_t> ||
181
+ cute::is_same_v<T, cutlass::uint8_t> ||
182
+ cute::is_same_v<T, cutlass::float_e4m3_t> ||
183
+ cute::is_same_v<T, cutlass::float_e5m2_t>,
184
+ "only 8 bit arithmetic types are supported.");
185
+ CUTLASS_HOST_DEVICE
186
+ explicit packed_scale_t(T val) {
187
+ if constexpr (!cute::is_unsigned_v<T>) {
188
+ // Only pack negative values. The positive values are generated in flight in the mainloop.
189
+ storage[0] = pack4(T(float(val) * -8.f), T(float(val) * -7.f), T(float(val) * -6.f), T(float(val) * -5.f));
190
+ storage[1] = pack4(T(float(val) * -4.f), T(float(val) * -3.f), T(float(val) * -2.f), -val);
191
+ }
192
+ else {
193
+ storage[0] = pack4(T(float(val) * 8.f), T(float(val) * 7.f), T(float(val) * 6.f), T(float(val) * 5.f));
194
+ storage[1] = pack4(T(float(val) * 4.f), T(float(val) * 3.f), T(float(val) * 2.f), val);
195
+ }
196
+ }
197
+ CUTLASS_HOST_DEVICE
198
+ packed_scale_t() = default;
199
+ CUTLASS_HOST_DEVICE
200
+ explicit operator float() const {
201
+ return float(get());
202
+ }
203
+ CUTLASS_HOST_DEVICE
204
+ bool operator==(packed_scale_t const& rhs) const {
205
+ return storage[0] == rhs.storage[0] && storage[1] == rhs.storage[1];
206
+ }
207
+ CUTLASS_HOST_DEVICE
208
+ bool operator!=(packed_scale_t const& rhs) const {
209
+ return !(*this == rhs);
210
+ }
211
+ CUTLASS_HOST_DEVICE
212
+ friend packed_scale_t operator+(packed_scale_t const& lhs, packed_scale_t const& rhs) {
213
+ return packed_scale_t(lhs.get() + rhs.get());
214
+ }
215
+ CUTLASS_HOST_DEVICE
216
+ friend packed_scale_t operator-(packed_scale_t const& lhs, packed_scale_t const& rhs) {
217
+ return packed_scale_t(lhs.get() - rhs.get());
218
+ }
219
+ CUTLASS_HOST_DEVICE
220
+ friend packed_scale_t operator*(packed_scale_t const& lhs, packed_scale_t const& rhs) {
221
+ return packed_scale_t(lhs.get() * rhs.get());
222
+ }
223
+ CUTLASS_HOST_DEVICE
224
+ friend packed_scale_t operator/(packed_scale_t const& lhs, packed_scale_t const& rhs) {
225
+ return packed_scale_t(lhs.get() / rhs.get());
226
+ }
227
+
228
+ private:
229
+ using Storage = uint32_t;
230
+ using Stage = uint8_t;
231
+
232
+ Storage storage[2] {};
233
+
234
+ CUTLASS_HOST_DEVICE
235
+ static Storage pack4(T c1, T c2, T c3, T c4) {
236
+ Storage result = 0;
237
+ result |= (static_cast<Storage>(reinterpret_cast<Stage const&>(c4)) << 24);
238
+ result |= (static_cast<Storage>(reinterpret_cast<Stage const&>(c3)) << 16);
239
+ result |= (static_cast<Storage>(reinterpret_cast<Stage const&>(c2)) << 8);
240
+ result |= static_cast<Storage>(reinterpret_cast<Stage const&>(c1));
241
+ return result;
242
+ }
243
+ CUTLASS_HOST_DEVICE
244
+ T get() const {
245
+ auto stage = static_cast<Stage>(storage[0] >> 8);
246
+ #if defined(__CUDA_ARCH__)
247
+ return reinterpret_cast<T const&>(stage);
248
+ #else
249
+ T tmp;
250
+ std::memcpy(&tmp, &stage, sizeof(Stage));
251
+ return tmp;
252
+ #endif
253
+ }
254
+ CUTLASS_HOST_DEVICE
255
+ T get(int idx) const {
256
+ Stage stage;
257
+ if (idx < 4) stage = static_cast<Stage>(storage[0] >> (8 * idx));
258
+ else stage = static_cast<Stage>(storage[1] >> (8 * idx - 32));
259
+ #if defined(__CUDA_ARCH__)
260
+ return reinterpret_cast<T const&>(stage);
261
+ #else
262
+ T tmp;
263
+ std::memcpy(&tmp, &stage, sizeof(Stage));
264
+ return tmp;
265
+ #endif
266
+ }
267
+ };
268
+
269
+ // In the mainloop, PRMT selects 1 byte from only 8 bytes so the sign bit is handled in an extra PRMT.
270
+ // Here the encodings of positive values and negative values are unified (except for the sign bit).
271
+ // For instance, 1 becomes 0b0111, which is the same encoding as -1 (0b1111).
272
+ static bool unified_encode_int4b(cutlass::int4b_t const *block_in, cutlass::int4b_t *block_out, const size_t block_size) {
273
+
274
+ using StorageType = cutlass::int4b_t::Storage;
275
+ constexpr int pack = cute::sizeof_bits_v<StorageType> / 4;
276
+ const size_t host_buf_size = block_size / pack;
277
+ std::vector<StorageType> host_buf(host_buf_size);
278
+ cutlass::device_memory::copy_to_host(host_buf.data(), (StorageType *) block_in, host_buf_size);
279
+
280
+ for (auto&& d : host_buf) {
281
+ StorageType out = 0;
282
+ StorageType mask = 0x0f;
283
+ for (int i = 0; i < pack; i++) {
284
+ cutlass::int4b_t curr;
285
+ curr.storage = (d >> (i * 4)) & 0x0f;
286
+ switch (curr) {
287
+ case 1: curr.storage = StorageType(0b0111); break; // 2's complement
288
+ case 2: curr.storage = StorageType(0b0110); break; // 2's complement
289
+ case 3: curr.storage = StorageType(0b0101); break; // 2's complement
290
+ case 4: curr.storage = StorageType(0b0100); break; // 2's complement
291
+ case 5: curr.storage = StorageType(0b0011); break; // 2's complement
292
+ case 6: curr.storage = StorageType(0b0010); break; // 2's complement
293
+ case 7: curr.storage = StorageType(0b0001); break; // 2's complement
294
+ default: break;
295
+ }
296
+ out |= (curr.storage << (4 * i)) & mask;
297
+ mask <<= 4;
298
+ }
299
+ d = out;
300
+ }
301
+
302
+ cutlass::device_memory::copy_to_device((StorageType*) block_out, host_buf.data(), host_buf_size);
303
+ return true;
304
+ }
305
+
306
+ template <class ElementScale>
307
+ static bool pack_scale_fp8(ElementScale const *block_in, cutlass::Array<ElementScale, 8> *block_out, const size_t block_size) {
308
+ std::vector<ElementScale> data_in(block_size);
309
+ std::vector<cutlass::Array<ElementScale, 8>> data_out(block_size);
310
+
311
+ try {
312
+ cutlass::device_memory::copy_to_host(data_in.data(), block_in, block_size);
313
+ }
314
+ catch (cutlass::cuda_exception const& e) {
315
+ std::cerr << "CUDA Error: " << cudaGetErrorString(e.cudaError()) << std::endl;
316
+ return false;
317
+ }
318
+
319
+ for (size_t i = 0; i < block_size; i++) {
320
+ cutlass::packed_scale_t<ElementScale> tmp(data_in[i]);
321
+ data_out[i] = reinterpret_cast<cutlass::Array<ElementScale, 8> const&>(tmp);
322
+ }
323
+
324
+ try {
325
+ cutlass::device_memory::copy_to_device(block_out, data_out.data(), block_size);
326
+ }
327
+ catch (cutlass::cuda_exception const& e) {
328
+ std::cerr << "CUDA Error: " << cudaGetErrorString(e.cudaError()) << std::endl;
329
+ return false;
330
+ }
331
+ return true;
332
+ }
333
+
334
+ template <class T, class = void>
335
+ struct UnderlyingElement {
336
+ using type = T;
337
+ };
338
+
339
+ template <class T>
340
+ struct UnderlyingElement<T, cute::void_t<typename T::Element>> {
341
+ using type = typename T::Element;
342
+ };
343
+
344
+ // Given a type of MMA instruction, compute a memory reordering atom that places all values
345
+ // owned by each thread in contiguous memory locations. This improves smem load vectorization,
346
+ // particularly for mixed dtype GEMMs where a narrow type is loaded in the thread/value order
347
+ // of the wider type and may result in inefficient sub-bank (8-bit or 16-bit) accesses.
348
+ // In addition, we can reorder the values across several MMA instructions to get even wider
349
+ // vectorization (AtomLayout parameter) and permute the values within each instruction to get
350
+ // more optimal conversion instruction sequences (ValLayout parameter).
351
+ template <class ElementMma,
352
+ class AtomLayout = cute::Layout<cute::_1>,
353
+ class ValLayout = cute::Layout<cute::_1>>
354
+ constexpr auto compute_memory_reordering_atom(AtomLayout atom_layout = {}, ValLayout val_layout = {})
355
+ {
356
+ using namespace cute;
357
+
358
+ static_assert(is_static_v<ValLayout>, "ValLayout must be static");
359
+ static_assert(is_static_v<AtomLayout>, "AtomLayout must be static");
360
+
361
+ // 1. Choose an MMA atom to access TV layout and MN shape
362
+ // Note: parameters like GMMA Major, TileShape, ElementC don't affect TV layout of A, use arbitrary
363
+ using MmaAtom = decltype(SM90::GMMA::rs_op_selector<ElementMma, ElementMma, float, Shape<_64,_16,_32>>());
364
+ using MmaTraits = MMA_Traits<MmaAtom>;
365
+ auto mk_shape_mma = select<0,2>(typename MmaTraits::Shape_MNK{});
366
+ auto tv_layout_mma = typename MmaTraits::ALayout{};
367
+ static_assert(size<1>(tv_layout_mma) % size(val_layout) == 0, "Value layout must evenly divide the MMA value layout");
368
+
369
+ // 2. Create a single warp's TV layout from that of the whole MMA and invert to get (m,k -> thr,val)
370
+ // Note: this assumes A is partitioned between warps along M mode
371
+ auto tv_tiler_warp = make_shape(Int<32>{}, size<1>(tv_layout_mma));
372
+ auto mk_shape_warp = shape_div(mk_shape_mma, size(typename MmaTraits::ThrID{}) / Int<32>{});
373
+ auto tv_layout_mma_warp = make_layout_like(composition(tv_layout_mma, tv_tiler_warp));
374
+ auto mk_layout_mma_warp = right_inverse(tv_layout_mma_warp).with_shape(mk_shape_warp);
375
+
376
+ // 3. Repeat the warp layout NumAtoms times along K mode to get wider vectorization
377
+ auto mk_layout_mma_trgt = blocked_product(mk_layout_mma_warp, atom_layout);
378
+
379
+ // 4. Compose with a contiguous layout of values in each thread (required for smem vectorization)
380
+ auto val_to_offset = logical_product(val_layout, size<1>(tv_layout_mma) / size(val_layout) * size(atom_layout));
381
+ auto thr_to_offset = make_layout(size<0>(tv_layout_mma_warp));
382
+ auto tv_to_offset = select<1,0>(logical_product(val_to_offset, thr_to_offset));
383
+ auto layout_atom = composition(tv_to_offset, mk_layout_mma_trgt);
384
+
385
+ return layout_atom;
386
+ }
387
+
388
+ template <class TileShape, class EngineSrc, class LayoutSrc, class EngineDst, class LayoutDst, class TiledCopy>
389
+ __global__ void reorder_tensor_kernel(
390
+ cute::Tensor<EngineSrc, LayoutSrc> S,
391
+ cute::Tensor<EngineDst, LayoutDst> D,
392
+ TiledCopy tiled_copy)
393
+ {
394
+ using namespace cute;
395
+
396
+ using T = typename EngineDst::value_type;
397
+
398
+ Tensor gS = local_tile(S, TileShape{}, make_coord(blockIdx.x, _, blockIdx.z));
399
+ Tensor gD = local_tile(D, TileShape{}, make_coord(blockIdx.x, _, blockIdx.z));
400
+
401
+ auto thread_copy = tiled_copy.get_slice(threadIdx.x);
402
+ Tensor tS = thread_copy.partition_S(gS);
403
+ Tensor tD = thread_copy.partition_D(gD);
404
+
405
+ copy(tiled_copy, tS, tD);
406
+ }
407
+
408
+ template <class EngineSrc, class LayoutSrc, class EngineDst, class LayoutDst>
409
+ void reorder_tensor(
410
+ cute::Tensor<EngineSrc, LayoutSrc> S,
411
+ cute::Tensor<EngineDst, LayoutDst> D)
412
+ {
413
+ using namespace cute;
414
+
415
+ using T = typename EngineDst::value_type;
416
+ static_assert(is_same_v<remove_const_t<typename EngineSrc::value_type>, T>, "Type mismatch");
417
+
418
+ // Construct a value layout that assigns at least 8 bits of contiguous elements in destination tensor to a thread
419
+ // This avoids a race condition when writing out subbyte types (e.g. int4b_t).
420
+ auto has_major_mode = [](auto s) {
421
+ return any_of(flatten(s), [](auto a){ return is_constant<1, decltype(a)>{}; });
422
+ };
423
+ static_assert(has_major_mode(stride<0>(LayoutDst{})) ^ has_major_mode(stride<1>(LayoutDst{})),
424
+ "Could not find stride-1 mode in destination layout");
425
+ constexpr int N = shape_div(Int<8>{}, Int<sizeof_bits_v<T>>{});
426
+ auto val_layout = conditional_return<has_major_mode(stride<0>(LayoutDst{}))>(
427
+ make_layout(make_shape(Int<N>{}, Int<1>{}), GenColMajor{}),
428
+ make_layout(make_shape(Int<1>{}, Int<N>{}), GenRowMajor{}));
429
+
430
+ // Make a tiled copy with a simple row-major thread order and above layout
431
+ int constexpr NumThreads = 128;
432
+ auto const thr_layout = make_layout(make_shape(Int<1>{}, Int<NumThreads>{}));
433
+ auto tiled_copy = make_tiled_copy(Copy_Atom<DefaultCopy, T>{}, thr_layout, val_layout);
434
+
435
+ // Assign a group of 16 rows to a threadblock; this matches the shuffle atom size for Hopper
436
+ using TileShape = Shape<_16>;
437
+ auto tiled_D = group_modes<3,rank_v<LayoutDst>>(tiled_divide(D, TileShape{}));
438
+ dim3 blocks{unsigned(size<1>(tiled_D)), 1u, unsigned(size<3>(tiled_D))};
439
+
440
+ reorder_tensor_kernel<TileShape><<<blocks, NumThreads>>>(S, D, tiled_copy);
441
+ CUDA_CHECK(cudaDeviceSynchronize());
442
+ }
443
+
444
+ // In-place version
445
+ template <class T, class LayoutSrc, class LayoutDst>
446
+ void reorder_tensor(
447
+ T const* src,
448
+ LayoutSrc const& layout_src,
449
+ T * dst,
450
+ LayoutDst const& layout_dst)
451
+ {
452
+ using namespace cute;
453
+ reorder_tensor(make_tensor(make_gmem_ptr<T>(src), layout_src),
454
+ make_tensor(make_gmem_ptr<T>(dst), layout_dst));
455
+ }
456
+
457
+ // In-place version
458
+ template <class T, class LayoutSrc, class LayoutDst>
459
+ void reorder_tensor(
460
+ T * data,
461
+ LayoutSrc const& layout_src,
462
+ LayoutDst const& layout_dst)
463
+ {
464
+ using namespace cute;
465
+ cutlass::DeviceAllocation<T> temp(size(layout_src));
466
+ reorder_tensor(data, layout_src, temp.get(), layout_dst);
467
+ cutlass::device_memory::copy_device_to_device(data, temp.get(), static_cast<size_t>(size(layout_src)));
468
+ }
469
+
470
+ #undef CUDA_CHECK
471
+
472
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/packed_stride.hpp ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Utilities for packing constructing canonical CuTe stride types for 3.x mainloop params.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cute/layout.hpp"
38
+ #include "cute/container/array.hpp" // cute::array
39
+ #include "cutlass/conv/convolution.h" // cutlass::conv::Operator
40
+
41
+ /////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
+ namespace cutlass {
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ // Strides without batch mode
48
+
49
+ template <class IntT>
50
+ CUTLASS_HOST_DEVICE
51
+ cute::Stride<IntT, cute::Int<1>>
52
+ make_cute_packed_stride(cute::Stride<IntT, cute::Int<1>> s, cute::Shape<int,int,int> shape_MKL) {
53
+ static_assert(std::is_integral_v<IntT>,
54
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
55
+ auto s_copy = s;
56
+ cute::get<0>(s_copy) = static_cast<IntT>(cute::get<1>(shape_MKL));
57
+ return s_copy;
58
+ }
59
+
60
+ template <class IntT>
61
+ CUTLASS_HOST_DEVICE
62
+ cute::Stride<cute::Int<1>, IntT>
63
+ make_cute_packed_stride(cute::Stride<cute::Int<1>, IntT> s, cute::Shape<int,int,int> shape_MKL) {
64
+ static_assert(std::is_integral_v<IntT>,
65
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
66
+ auto s_copy = s;
67
+ cute::get<1>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL));
68
+ return s_copy;
69
+ }
70
+
71
+ /////////////////////////////////////////////////////////////////////////////////////////////////
72
+
73
+ // Strides with batch mode
74
+
75
+ template <class IntT>
76
+ CUTLASS_HOST_DEVICE
77
+ cute::Stride<IntT, cute::Int<1>, int64_t>
78
+ make_cute_packed_stride(cute::Stride<IntT, cute::Int<1>, int64_t> s, cute::Shape<int,int,int> shape_MKL) {
79
+ static_assert(std::is_integral_v<IntT>,
80
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
81
+ auto s_copy = s;
82
+ cute::get<0>(s_copy) = static_cast<IntT>(cute::get<1>(shape_MKL));
83
+ int batch_count = cute::get<2>(shape_MKL);
84
+ if (batch_count > 1) {
85
+ cute::get<2>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL));
86
+ }
87
+ else {
88
+ cute::get<2>(s_copy) = static_cast<IntT>(0);
89
+ }
90
+ return s_copy;
91
+ }
92
+
93
+ template <class IntT>
94
+ CUTLASS_HOST_DEVICE
95
+ cute::Stride<cute::Int<1>, IntT, int64_t>
96
+ make_cute_packed_stride(cute::Stride<cute::Int<1>, IntT, int64_t> s, cute::Shape<int,int,int> shape_MKL) {
97
+ static_assert(std::is_integral_v<IntT>,
98
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
99
+ auto s_copy = s;
100
+ cute::get<1>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL));
101
+ int batch_count = cute::get<2>(shape_MKL);
102
+ if (batch_count > 1) {
103
+ cute::get<2>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL));
104
+ }
105
+ else {
106
+ cute::get<2>(s_copy) = static_cast<IntT>(0);
107
+ }
108
+ return s_copy;
109
+ }
110
+
111
+ /////////////////////////////////////////////////////////////////////////////////////////////////
112
+
113
+ // Strides with group mode
114
+
115
+ template <class StrideIntT>
116
+ CUTLASS_HOST_DEVICE
117
+ cute::Stride<StrideIntT, cute::Int<1>, cute::Int<0>>
118
+ make_cute_packed_stride(cute::Stride<StrideIntT, cute::Int<1>, cute::Int<0>> s, cute::Shape<int,int,int> shape_MKL) {
119
+ static_assert(std::is_integral_v<StrideIntT>,
120
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
121
+ auto s_copy = s;
122
+ cute::get<0>(s_copy) = static_cast<StrideIntT>(cute::get<1>(shape_MKL));
123
+ return s_copy;
124
+ }
125
+
126
+ template <class StrideIntT>
127
+ CUTLASS_HOST_DEVICE
128
+ cute::Stride<cute::Int<1>, StrideIntT, cute::Int<0>>
129
+ make_cute_packed_stride(cute::Stride<cute::Int<1>, StrideIntT, cute::Int<0>> s, cute::Shape<int,int,int> shape_MKL) {
130
+ static_assert(std::is_integral_v<StrideIntT>,
131
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
132
+ auto s_copy = s;
133
+ cute::get<1>(s_copy) = static_cast<StrideIntT>(cute::get<0>(shape_MKL));
134
+ return s_copy;
135
+ }
136
+
137
+ /////////////////////////////////////////////////////////////////////////////////////////////////
138
+
139
+ // Strides for convolutions
140
+
141
+ // Output cutlass::layout::TensorNDHWC -> rank-3 stride (InT,_1,_0)
142
+ // Note: For fprop/dgrad kernel, strides are assumed to be layout right in NZPQK/NDHWC order
143
+ // and therefore can be coalesced to just q/w. For wgrad kernel, strides are assumed to be layout
144
+ // right in KTRSC order and can be coalesced to just k.
145
+ // We enforce this condition here with asserts.
146
// Coalesce a layout-right conv output tensor into a rank-3 stride (InT,_1,_0).
// The asserts verify the layout-right packing assumption described in the
// comment above; only one dynamic stride survives the coalescing.
template <class IntT, size_t RankT_>
CUTLASS_HOST_DEVICE
cute::Stride<IntT, cute::Int<1>, cute::Int<0>>
make_cute_packed_stride(
    cute::Stride<IntT, cute::Int<1>, cute::Int<0>> s,
    cute::array<int32_t, RankT_> shape_output,
    cute::array<IntT, RankT_> stride_output,
    cutlass::conv::Operator conv_op) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
  static_assert(RankT_ >= 3u);
  constexpr static int RankT = static_cast<int>(RankT_);

  // Innermost (channel) mode must be contiguous.
  assert(stride_output[RankT-1] == 1);
  // Every other mode must be layout-right packed: its stride equals the
  // product of the extent and stride of the next-faster-varying mode.
  cute::for_each(cute::make_seq<RankT-2>{}, [&](auto i) {
    assert(stride_output[i] == shape_output[i+1] * stride_output[i+1]);
  });

  auto s_copy = s;
  // Wgrad output is layout-right in KTRSC order, so the coalesced stride is
  // the leading (k) stride; fprop/dgrad outputs are in NZPQK/NDHWC order, so
  // it is the second-innermost (q/w) stride.
  cute::get<0>(s_copy) = (conv_op == cutlass::conv::Operator::kWgrad) ?
      stride_output[0] :
      stride_output[RankT-2];
  return s_copy;
}
170
+
171
+ //
172
+ // Activation tensor ((w, h, d, n), _1) for fprop kernel
173
+ //
174
+
175
+ // Activation cutlass::layout::TensorNWC -> rank-2 stride ((W,N),_1)
176
+ template <class IntT>
177
+ CUTLASS_HOST_DEVICE
178
+ cute::Stride<cute::Stride<IntT, IntT>, cute::Int<1>>
179
+ make_cute_packed_stride(
180
+ cute::Stride<cute::Stride<IntT, IntT>, cute::Int<1>> s,
181
+ cute::array<IntT, 3> stride_nwc,
182
+ conv::Operator ConvOp) {
183
+ static_assert(std::is_integral_v<IntT>,
184
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
185
+ assert(stride_nwc[2] == 1);
186
+ auto s_copy = s;
187
+ cute::get<0,0>(s_copy) = stride_nwc[1];
188
+ cute::get<0,1>(s_copy) = stride_nwc[0];
189
+ return s_copy;
190
+ }
191
+
192
+ // Activation cutlass::layout::TensorNHWC -> rank-2 stride ((W,H,N),_1)
193
// Fprop activation, cutlass::layout::TensorNHWC -> rank-2 stride ((W,H,N),_1).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Stride<IntT, IntT, IntT>, cute::Int<1>>
make_cute_packed_stride(
    cute::Stride<cute::Stride<IntT, IntT, IntT>, cute::Int<1>> s,
    cute::array<IntT, 4> stride_nhwc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
  // Channel mode must be contiguous.
  assert(stride_nhwc[3] == 1);
  auto s_copy = s;
  // Reverse the (n,h,w) strides into (W,H,N) order: nested mode i takes
  // stride_nhwc[2-i].
  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
    cute::get<0,i>(s_copy) = stride_nhwc[2-i];
  });
  return s_copy;
}
209
+
210
+ // Activation cutlass::layout::TensorNDHWC -> rank-2 stride ((W,H,D,N),_1)
211
// Fprop activation, cutlass::layout::TensorNDHWC -> rank-2 stride ((W,H,D,N),_1).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Stride<IntT, IntT, IntT, IntT>, cute::Int<1>>
make_cute_packed_stride(
    cute::Stride<cute::Stride<IntT, IntT, IntT, IntT>, cute::Int<1>> s,
    cute::array<IntT, 5> stride_ndhwc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_ndhwc[4] == 1);
  auto s_copy = s;
  // Reverse the (n,d,h,w) strides into (W,H,D,N) order: nested mode i takes
  // stride_ndhwc[3-i].
  cute::for_each(cute::make_seq<4>{}, [&](auto i) {
    cute::get<0,i>(s_copy) = stride_ndhwc[3-i];
  });
  return s_copy;
}
228
+
229
+ //
230
+ // Filter tensor (k, (_1, s, r, t)) for fprop kernel
231
+ //
232
+
233
+ // Filter cutlass::layout::TensorNWC -> rank-2 stride (k, (_1, s))
234
+ template <class IntT>
235
+ CUTLASS_HOST_DEVICE
236
+ cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>>
237
+ make_cute_packed_stride(
238
+ cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>> s,
239
+ cute::array<IntT, 3> stride_ksc,
240
+ conv::Operator ConvOp) {
241
+ static_assert(std::is_integral_v<IntT>,
242
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
243
+
244
+ assert(stride_ksc[2] == 1);
245
+ auto s_copy = s;
246
+ cute::get<0,0>(s_copy) = stride_ksc[0];
247
+ cute::get<1,1>(s_copy) = stride_ksc[1];
248
+ return s_copy;
249
+ }
250
+
251
+ // Filter cutlass::layout::TensorNHWC -> rank-2 stride (k, (_1, s, r))
252
// Fprop filter, cutlass::layout::TensorNHWC (interpreted as KRSC) -> rank-2
// stride (k, (_1, s, r)).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>>
make_cute_packed_stride(
    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>> s,
    cute::array<IntT, 4> stride_krsc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_krsc[3] == 1);
  auto s_copy = s;
  cute::get<0,0>(s_copy) = stride_krsc[0];
  // Reverse the (r,s) strides into inner-mode (s,r) order (mode 0 of the
  // inner tuple is the static unit channel stride).
  cute::for_each(cute::make_seq<2>{}, [&](auto i) {
    cute::get<1,2-i>(s_copy) = stride_krsc[i+1];
  });
  return s_copy;
}
270
+
271
+ // Filter cutlass::layout::TensorNDHWC -> rank-2 stride (k, (_1, s, r, t))
272
// Fprop filter, cutlass::layout::TensorNDHWC (interpreted as KTRSC) -> rank-2
// stride (k, (_1, s, r, t)).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>>
make_cute_packed_stride(
    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>> s,
    cute::array<IntT, 5> stride_ktrsc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_ktrsc[4] == 1);
  auto s_copy = s;
  cute::get<0,0>(s_copy) = stride_ktrsc[0];
  // Reverse the (t,r,s) strides into inner-mode (s,r,t) order.
  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
    cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1];
  });
  return s_copy;
}
290
+
291
+ //
292
+ // Activation tensor (_1, (w, h, d, n)) for wgrad kernel
293
+ //
294
+ // It is also Filter tensor ((_1), (k, s, r, t)) for dgrad kernel
295
+ //
296
+
297
+ // Activation cutlass::layout::TensorNWC -> rank-2 stride (_1, (W,N)) in wgrad
298
+ // Filter cutlass::layout::TensorNWC -> rank-2 stride ((_1), (k, s)) in dgrad
299
+ template <class IntT>
300
+ CUTLASS_HOST_DEVICE
301
+ cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT>>
302
+ make_cute_packed_stride(
303
+ cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT>> s,
304
+ cute::array<IntT, 3> stride_nwc,
305
+ conv::Operator ConvOp) {
306
+ static_assert(std::is_integral_v<IntT>,
307
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
308
+
309
+ assert(stride_nwc[2] == 1);
310
+ auto s_copy = s;
311
+ if (ConvOp == cutlass::conv::Operator::kWgrad) {
312
+ cute::get<1,0>(s_copy) = stride_nwc[1];
313
+ cute::get<1,1>(s_copy) = stride_nwc[0];
314
+ }
315
+ else if (ConvOp == cutlass::conv::Operator::kDgrad) {
316
+ // stride_nwc in dgrad is ksc.
317
+ cute::get<1,0>(s_copy) = stride_nwc[0];
318
+ cute::get<1,1>(s_copy) = stride_nwc[1];
319
+ }
320
+ return s_copy;
321
+ }
322
+
323
+ // Activation cutlass::layout::TensorNHWC -> rank-2 stride (_1, (W,H,N)) in wgrad
324
+ // Filter cutlass::layout::TensorNHWC -> rank-2 stride ((_1), (k, s, r)) in dgrad
325
// Wgrad activation, cutlass::layout::TensorNHWC -> rank-2 stride (_1, (W,H,N)),
// or dgrad filter (array holds KRSC strides) -> ((_1), (k, s, r)).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT>>
make_cute_packed_stride(
    cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT>> s,
    cute::array<IntT, 4> stride_nhwc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Innermost mode must be contiguous.
  assert(stride_nhwc[3] == 1);
  auto s_copy = s;
  if (ConvOp == cutlass::conv::Operator::kWgrad) {
    // Reverse (n,h,w) into (W,H,N) order.
    cute::for_each(cute::make_seq<3>{}, [&](auto i) {
      cute::get<1,i>(s_copy) = stride_nhwc[2-i];
    });
  }
  else if (ConvOp == cutlass::conv::Operator::kDgrad) {
    // stride_nhwc in dgrad is krsc.
    cute::get<1,0>(s_copy) = stride_nhwc[0];
    // Reverse the (r,s) strides into (s,r) order after the k stride.
    cute::for_each(cute::make_seq<2>{}, [&](auto i) {
      cute::get<1,2-i>(s_copy) = stride_nhwc[i+1];
    });
  }
  return s_copy;
}
351
+
352
+ // Activation cutlass::layout::TensorNDHWC -> rank-2 stride (_1, (W,H,D,N)) in wgrad
353
+ // Filter cutlass::layout::TensorNDHWC -> rank-2 stride ((_1), (k, s, r, t)) in dgrad
354
// Wgrad activation, cutlass::layout::TensorNDHWC -> rank-2 stride (_1, (W,H,D,N)),
// or dgrad filter (array holds KTRSC strides) -> ((_1), (k, s, r, t)).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT, IntT>>
make_cute_packed_stride(
    cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT, IntT>> s,
    cute::array<IntT, 5> stride_ndhwc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Innermost mode must be contiguous.
  assert(stride_ndhwc[4] == 1);
  auto s_copy = s;
  if (ConvOp == cutlass::conv::Operator::kWgrad) {
    // Reverse (n,d,h,w) into (W,H,D,N) order.
    cute::for_each(cute::make_seq<4>{}, [&](auto i) {
      cute::get<1,i>(s_copy) = stride_ndhwc[3-i];
    });
  }
  else if (ConvOp == cutlass::conv::Operator::kDgrad) {
    // stride_ndhwc in dgrad is ktrsc.
    cute::get<1,0>(s_copy) = stride_ndhwc[0];
    // Reverse the (t,r,s) strides into (s,r,t) order after the k stride.
    cute::for_each(cute::make_seq<3>{}, [&](auto i) {
      cute::get<1,3-i>(s_copy) = stride_ndhwc[i+1];
    });
  }
  return s_copy;
}
380
+
381
+ //
382
+ // NZPQ tensor (_1, nzpq) for wgrad kernel
383
+ //
384
+
385
+ // cutlass::layout::TensorNWC -> rank-2 stride (_1, nzpq)
386
+ template <class IntT>
387
+ CUTLASS_HOST_DEVICE
388
+ cute::Stride<cute::Int<1>, IntT>
389
+ make_cute_packed_stride(
390
+ cute::Stride<cute::Int<1>, IntT> s,
391
+ cute::array<IntT, 3> stride_nqk,
392
+ conv::Operator ConvOp) {
393
+ static_assert(std::is_integral_v<IntT>,
394
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
395
+
396
+ assert(stride_nqk[2] == 1);
397
+ auto s_copy = s;
398
+ cute::get<1>(s_copy) = stride_nqk[1];
399
+ return s_copy;
400
+ }
401
+
402
+ // cutlass::layout::TensorNHWC -> rank-2 stride (_1, nzpq)
403
// Wgrad NZPQ tensor, cutlass::layout::TensorNHWC -> rank-2 stride (_1, nzpq).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Int<1>, IntT>
make_cute_packed_stride(
    cute::Stride<cute::Int<1>, IntT> s,
    cute::array<IntT, 4> stride_npqk,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel (k) mode must be contiguous.
  assert(stride_npqk[3] == 1);
  auto s_copy = s;
  // The linearized nzpq mode advances by the q stride.
  cute::get<1>(s_copy) = stride_npqk[2];
  return s_copy;
}
418
+
419
+ // cutlass::layout::TensorNDHWC -> rank-2 stride (_1, nzpq)
420
// Wgrad NZPQ tensor, cutlass::layout::TensorNDHWC -> rank-2 stride (_1, nzpq).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Int<1>, IntT>
make_cute_packed_stride(
    cute::Stride<cute::Int<1>, IntT> s,
    cute::array<IntT, 5> stride_nzpqk,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel (k) mode must be contiguous.
  assert(stride_nzpqk[4] == 1);
  auto s_copy = s;
  // The linearized nzpq mode advances by the q stride.
  cute::get<1>(s_copy) = stride_nzpqk[3];
  return s_copy;
}
435
+
436
+
437
+
438
+ //
439
+ // Wgrad output tensor (k, (_1, s, r, t), _0)
440
+ //
441
+
442
+ // Filter cutlass::layout::TensorKCS -> rank-3 stride (k, (_1, s), _0)
443
+ template <class IntT>
444
+ CUTLASS_HOST_DEVICE
445
+ cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>, cute::Int<0>>
446
+ make_cute_packed_stride(
447
+ cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>, cute::Int<0>> s,
448
+ [[maybe_unused]] cute::array<int32_t, 3> shape_output,
449
+ cute::array<IntT, 3> stride_ksc,
450
+ conv::Operator ConvOp) {
451
+ static_assert(std::is_integral_v<IntT>,
452
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
453
+
454
+ assert(stride_ksc[2] == 1);
455
+ auto s_copy = s;
456
+ cute::get<0,0>(s_copy) = stride_ksc[0];
457
+ cute::get<1,1>(s_copy) = stride_ksc[1];
458
+ return s_copy;
459
+ }
460
+
461
+ // Filter cutlass::layout::TensorKCSR -> rank-3 stride (k, (_1, s, r), _0)
462
// Wgrad output filter, cutlass::layout::TensorKCSR -> rank-3 stride
// (k, (_1, s, r), _0).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>, cute::Int<0>>
make_cute_packed_stride(
    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>, cute::Int<0>> s,
    [[maybe_unused]] cute::array<int32_t, 4> shape_output,
    cute::array<IntT, 4> stride_krsc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_krsc[3] == 1);
  auto s_copy = s;
  cute::get<0,0>(s_copy) = stride_krsc[0];
  // Reverse the (r,s) strides into inner-mode (s,r) order.
  cute::for_each(cute::make_seq<2>{}, [&](auto i) {
    cute::get<1,2-i>(s_copy) = stride_krsc[i+1];
  });
  return s_copy;
}
481
+
482
+ // Filter cutlass::layout::TensorKCSRT -> rank-3 stride (k, (_1, s, r, t), _0)
483
// Wgrad output filter, cutlass::layout::TensorKCSRT -> rank-3 stride
// (k, (_1, s, r, t), _0).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>, cute::Int<0>>
make_cute_packed_stride(
    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>, cute::Int<0>> s,
    [[maybe_unused]] cute::array<int32_t, 5> shape_output,
    cute::array<IntT, 5> stride_ktrsc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_ktrsc[4] == 1);
  auto s_copy = s;
  cute::get<0,0>(s_copy) = stride_ktrsc[0];
  // Reverse the (t,r,s) strides into inner-mode (s,r,t) order.
  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
    cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1];
  });
  return s_copy;
}
502
+
503
+
504
+ //
505
+ // Wgrad output tensor ((_1, s, r, t), k, _0)
506
+ //
507
+
508
+ // Filter cutlass::layout::TensorCSK -> rank-3 stride ((_1, s), k, _0)
509
+ template <class IntT>
510
+ CUTLASS_HOST_DEVICE
511
+ cute::Stride<cute::Stride<cute::Int<1>, IntT>, IntT, cute::Int<0>>
512
+ make_cute_packed_stride(
513
+ cute::Stride<cute::Stride<cute::Int<1>, IntT>, IntT, cute::Int<0>> s,
514
+ [[maybe_unused]] cute::array<int32_t, 3> shape_output,
515
+ cute::array<IntT, 3> stride_ksc,
516
+ conv::Operator ConvOp) {
517
+ static_assert(std::is_integral_v<IntT>,
518
+ "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
519
+
520
+ assert(stride_ksc[2] == 1);
521
+ auto s_copy = s;
522
+ cute::get<1,0>(s_copy) = stride_ksc[0];
523
+ cute::get<0,1>(s_copy) = stride_ksc[1];
524
+ return s_copy;
525
+ }
526
+
527
+ // Filter cutlass::layout::TensorCSRK -> rank-3 stride ((_1, s, r), k, _0)
528
// Wgrad output filter, cutlass::layout::TensorCSRK -> rank-3 stride
// ((_1, s, r), k, _0).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT>, IntT, cute::Int<0>>
make_cute_packed_stride(
    cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT>, IntT, cute::Int<0>> s,
    [[maybe_unused]] cute::array<int32_t, 4> shape_output,
    cute::array<IntT, 4> stride_krsc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_krsc[3] == 1);
  auto s_copy = s;
  cute::get<1,0>(s_copy) = stride_krsc[0];
  // Reverse the (r,s) strides into nested-mode (s,r) order.
  cute::for_each(cute::make_seq<2>{}, [&](auto i) {
    cute::get<0,2-i>(s_copy) = stride_krsc[i+1];
  });
  return s_copy;
}
547
+
548
+ // Filter cutlass::layout::TensorCSRTK -> rank-3 stride ((_1, s, r, t), k, _0)
549
// Wgrad output filter, cutlass::layout::TensorCSRTK -> rank-3 stride
// ((_1, s, r, t), k, _0).
template <class IntT>
CUTLASS_HOST_DEVICE
cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT, IntT>, IntT, cute::Int<0>>
make_cute_packed_stride(
    cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT, IntT>, IntT, cute::Int<0>> s,
    [[maybe_unused]] cute::array<int32_t, 5> shape_output,
    cute::array<IntT, 5> stride_ktrsc,
    conv::Operator ConvOp) {
  static_assert(std::is_integral_v<IntT>,
    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");

  // Channel mode must be contiguous.
  assert(stride_ktrsc[4] == 1);
  auto s_copy = s;
  cute::get<1,0>(s_copy) = stride_ktrsc[0];
  // Reverse the (t,r,s) strides into nested-mode (s,r,t) order.
  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
    cute::get<0,3-i>(s_copy) = stride_ktrsc[i+1];
  });
  return s_copy;
}
568
+ /////////////////////////////////////////////////////////////////////////////////////////////////
569
+
570
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/print_error.hpp ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include <array>
35
+ #include <cassert>
36
+ #include <cmath>
37
+ #include <iostream>
38
+ #include <type_traits>
39
+
40
+ #include <cute/util/type_traits.hpp>
41
+ #include <cute/tensor.hpp>
42
+
43
+ #include <cute/numeric/numeric_types.hpp>
44
+ #include <cute/numeric/complex.hpp>
45
+
46
+ #include <cutlass/layout/layout.h>
47
+
48
+ // The computed infinity norm does not include
49
+ // any NaN column absolute-value sums.
50
// Result of an infinity-norm computation over a host matrix.
struct matrix_inf_norm_result {
  // Accumulate errors in double, as this is generally
  // the highest precision that the examples use.
  double inf_norm = 0.0;
  // True if any row absolute-value sum was NaN; such rows are excluded
  // from inf_norm by the functions that produce this struct.
  bool found_nan = false;
};
56
+
57
+ // In theory, cute::Tensor<ViewEngine<T*>, T> could be treated as a view type,
58
+ // and thus passed by value (as std::span or std::string_view would be).
59
+ // However, generic cute::Tensor are more like containers
60
+ // and thus are best passed by reference or const reference.
61
+ template <typename EngineType, typename LayoutType>
62
+ matrix_inf_norm_result
63
+ matrix_inf_norm(cute::Tensor<EngineType, LayoutType> const& host_matrix)
64
+ {
65
+ using error_type = decltype(std::declval<matrix_inf_norm_result>().inf_norm);
66
+ using element_type = typename EngineType::value_type;
67
+
68
+ error_type inf_norm = 0.0;
69
+ bool found_nan = false;
70
+
71
+ // Computing the infinity norm requires that we be able
72
+ // to treat the input as a matrix, with rows and columns.
73
+ const int64_t num_rows = cute::size<0>(host_matrix);
74
+ const int64_t num_cols = cute::size<1>(host_matrix);
75
+
76
+ auto abs_fn = [] (element_type A_ij) {
77
+ if constexpr (not std::is_unsigned_v<element_type>) {
78
+ using std::abs;
79
+ return abs(A_ij);
80
+ }
81
+ else {
82
+ return A_ij;
83
+ }
84
+ };
85
+
86
+ for (int64_t i = 0; i < num_rows; ++i) {
87
+ error_type row_abs_sum = 0.0;
88
+ for(int64_t j = 0; j < num_cols; ++j) {
89
+ row_abs_sum += abs_fn(host_matrix(i, j));
90
+ }
91
+ if (std::isnan(row_abs_sum)) {
92
+ found_nan = true;
93
+ }
94
+ else {
95
+ inf_norm = row_abs_sum > inf_norm ? row_abs_sum : inf_norm;
96
+ }
97
+ }
98
+
99
+ return {inf_norm, found_nan};
100
+ }
101
+
102
+ // Infinity norm of (X - Y).
103
// Infinity norm of (X - Y): maximum row absolute-value sum of the
// element-wise difference. NaN row sums set found_nan and are excluded
// from the norm, matching matrix_inf_norm's behavior.
template <typename EngineType, typename LayoutType>
matrix_inf_norm_result
matrix_diff_inf_norm(cute::Tensor<EngineType, LayoutType> const& X,
  cute::Tensor<EngineType, LayoutType> const& Y)
{
  using error_type = decltype(std::declval<matrix_inf_norm_result>().inf_norm);
  using element_type = typename EngineType::value_type;

  // abs() is unavailable for unsigned element types; return the value as-is.
  auto abs_fn = [] (element_type A_ij) {
    if constexpr (not std::is_unsigned_v<element_type>) {
      using std::abs;
      return abs(A_ij);
    }
    else {
      return A_ij;
    }
  };

  // The two matrices must have matching extents.
  assert(cute::size<0>(X) == cute::size<0>(Y));
  assert(cute::size<1>(X) == cute::size<1>(Y));

  // Computing the infinity norm requires that we be able
  // to treat the input as a matrix, with rows and columns.
  const int64_t num_rows = cute::size<0>(X);
  const int64_t num_cols = cute::size<1>(X);

  error_type inf_norm = 0.0;
  bool found_nan = false;

  for (int64_t i = 0; i < num_rows; ++i) {
    error_type row_abs_sum = 0.0;
    for (int64_t j = 0; j < num_cols; ++j) {
      row_abs_sum += error_type(abs_fn(element_type(X(i,j)) -
                                       element_type(Y(i,j))));
    }
    if (std::isnan(row_abs_sum)) {
      found_nan = true;
    }
    else {
      inf_norm = row_abs_sum > inf_norm ? row_abs_sum : inf_norm;
    }
  }

  return {inf_norm, found_nan};
}
148
+
149
// Print the (mollified) relative error of a GEMM result C against a
// reference C_ref, scaled by ||A||_inf * ||B||_inf, and return it.
// When that product is zero the raw difference norm is reported instead
// ("mollified"), avoiding a division by zero.
template <typename EngineType_A, typename LayoutType_A,
  typename EngineType_B, typename LayoutType_B,
  typename EngineType_C, typename LayoutType_C,
  typename EngineType_C_ref, typename LayoutType_C_ref>
auto
print_matrix_multiply_mollified_relative_error(
  char const A_value_type_name[],
  cute::Tensor<EngineType_A, LayoutType_A> const& A,
  char const B_value_type_name[],
  cute::Tensor<EngineType_B, LayoutType_B> const& B,
  char const C_value_type_name[],
  cute::Tensor<EngineType_C, LayoutType_C> const& C,
  cute::Tensor<EngineType_C_ref, LayoutType_C_ref> const& C_ref)
{
  const auto [A_norm, A_has_nan] = matrix_inf_norm(A);
  const auto [B_norm, B_has_nan] = matrix_inf_norm(B);
  const auto [C_norm, C_has_nan] = matrix_inf_norm(C_ref);
  const auto [diff_norm, diff_has_nan] = matrix_diff_inf_norm(C, C_ref);

  // Mollification: if the scale factor is zero, report the raw diff norm.
  const auto A_norm_times_B_norm = A_norm * B_norm;
  const auto relative_error = A_norm_times_B_norm == 0.0 ?
    diff_norm : (diff_norm / A_norm_times_B_norm);

  // For expected error bounds, please refer to the LAPACK Users' Guide,
  // in particular https://netlib.org/lapack/lug/node108.html .
  // Printing the infinity norm of C is a way to check
  // that both the function being tested (C)
  // and the reference implementation (C_ref)
  // don't just do nothing (or fill with zeros).
  using std::cout;
  using cute::shape;
  cout << "Matrix A: " << shape<0>(A) << "x" << shape<1>(A) << " of " << A_value_type_name << '\n'
       << "Matrix B: " << shape<0>(B) << "x" << shape<1>(B) << " of " << B_value_type_name << '\n'
       << "Matrix C: " << shape<0>(C) << "x" << shape<1>(C) << " of " << C_value_type_name << '\n'
       << std::scientific
       << "Infinity norm of A: " << A_norm << '\n'
       << "Infinity norm of B: " << B_norm << '\n'
       << "Infinity norm of C: " << C_norm << '\n'
       << "Infinity norm of (C - C_ref): " << diff_norm << '\n';

  // Label the value honestly: only the nonzero-denominator case is a
  // true relative error.
  if(A_norm_times_B_norm == 0.0) {
    cout << "Mollified relative error: " << relative_error << '\n';
  } else {
    cout << "Relative error: " << relative_error << '\n';
  }

  // Only print the NaN breakdown when at least one NaN was seen.
  if (A_has_nan || B_has_nan || C_has_nan || diff_has_nan) {
    cout << "Did we encounter NaN in A? " << (A_has_nan ? "yes" : "no") << '\n'
         << "Did we encounter NaN in B? " << (B_has_nan ? "yes" : "no") << '\n'
         << "Did we encounter NaN in C? " << (C_has_nan ? "yes" : "no") << '\n'
         << "Did we encounter NaN in (C - C_ref)? " << (diff_has_nan ? "yes" : "no") << '\n';
  }
  return relative_error;
}
203
+
204
+ template <typename EngineType, typename LayoutType>
205
+ auto
206
+ print_matrix_multiply_mollified_relative_error(
207
+ const char value_type_name[],
208
+ const cute::Tensor<EngineType, LayoutType>& A,
209
+ const cute::Tensor<EngineType, LayoutType>& B,
210
+ const cute::Tensor<EngineType, LayoutType>& C_computed,
211
+ const cute::Tensor<EngineType, LayoutType>& C_expected)
212
+ {
213
+ return print_matrix_multiply_mollified_relative_error(value_type_name, A, value_type_name, B,
214
+ value_type_name, C_computed, C_expected);
215
+ }
216
+
217
+ // Take a CUTLASS HostTensor (or the like) as input,
218
+ // and return a const CuTe Tensor.
219
+ // This is useful for use with the above error printing functions.
220
+ // This implicitly "transposes" if the layout is RowMajor.
221
+ // Note that the HostTensor must be captured by nonconst reference
222
+ // in order for X.host_ref().data() to compile.
223
+ // (CUTLASS is a bit more container-y than CuTe.)
224
+ template<class CutlassHostTensorType>
225
+ auto host_matrix_to_const_cute_tensor(CutlassHostTensorType& X)
226
+ {
227
+ // The tensors were created with post-transposed extents.
228
+ const auto extents = X.extent();
229
+ const auto shape = cute::Shape<int, int>{extents[0], extents[1]};
230
+ // Both RowMajor and ColumnMajor only store one stride.
231
+ const int LDX = X.stride(0);
232
+ const auto strides = [&]() {
233
+ using input_layout_type = typename std::decay_t<decltype(X)>::Layout;
234
+ if constexpr (std::is_same_v<input_layout_type, cutlass::layout::ColumnMajor>) {
235
+ return cute::Stride<int, int>{1, LDX};
236
+ }
237
+ else {
238
+ static_assert(std::is_same_v<input_layout_type, cutlass::layout::RowMajor>);
239
+ return cute::Stride<int, int>{LDX, 1};
240
+ }
241
+ }();
242
+ const auto layout = cute::make_layout(shape, strides);
243
+ auto X_data = X.host_ref().data();
244
+ auto X_data_const = const_cast<std::add_const_t< decltype(X_data)> >(X_data);
245
+ return cute::make_tensor(X_data_const, layout);
246
+ };
247
+
248
+
249
+ // Returns EXIT_SUCCESS if the 2-norm relative error is exactly zero, else returns EXIT_FAILURE.
250
+ // This makes the return value suitable as the return value of main().
251
// Compare n elements of `data` against `reference`, printing several error
// statistics, and return EXIT_SUCCESS iff the vector (2-norm) relative
// error is within error_margin — suitable as the return value of main().
//
// T1/T2 need only support operator[]; values are accumulated in double
// (or cute::complex<double> for complex element types).
template <typename T1, typename T2>
int
print_relative_error(
  std::size_t n,
  T1 const& data,
  T2 const& reference,
  bool print_verbose = false,
  bool print_error = true,
  double error_margin = 0.00001) {
  using std::abs; using std::sqrt;

  // Use either double or complex<double> for error computation
  using value_type = cute::remove_cvref_t<decltype(reference[0])>;
  using error_type = std::conditional_t<cute::is_complex<value_type>::value,
                                        cute::complex<double>,
                                        double>;

  if (print_verbose) {
    std::cout << "Idx:\t"<< "Val\t" << "RefVal\t" << "RelError" << std::endl;
  }

  // Tiny additive guard so zero reference values don't divide by zero.
  double eps = 1e-200;

  double tot_error_sq = 0;
  double tot_norm_sq = 0;
  double tot_ind_rel_err = 0;
  double max_ind_rel_err = 0;
  double max_diff = 0;
  for (std::size_t i = 0; i < n; ++i) {
    error_type val = data[i];
    error_type ref = reference[i];

    double aref = abs(ref);
    double diff = abs(ref - val);
    double rel_error = diff / (aref + eps);

    // Individual relative error
    tot_ind_rel_err += rel_error;

    // Maximum relative error
    max_ind_rel_err = std::max(max_ind_rel_err, rel_error);

    // Maximum delta in value error
    max_diff = std::max(max_diff, diff);

    // Total relative error
    tot_error_sq += diff * diff;
    tot_norm_sq += aref * aref;

    if (print_verbose) {
      std::cout << i << ":\t" << val << "\t" << ref << "\t" << rel_error << std::endl;
    }
  }

  double ave_rel_err = tot_ind_rel_err / double(n);
  if (print_error) {
    printf("Average relative error: %.3e\n", ave_rel_err);
  }

  if (print_error) {
    printf("Maximum relative error: %.3e\n", max_ind_rel_err);
  }

  if (print_error) {
    printf("Maximum difference : %.3e\n", max_diff);
  }

  // Vector (2-norm) relative error — this is the pass/fail criterion.
  double tot_rel_err = sqrt(tot_error_sq/(tot_norm_sq+eps));
  if (print_error) {
    printf("Vector relative error: %.3e\n", tot_rel_err);
  }

  // NOTE(review): unlike the statistics above, this line is printed even
  // when print_error is false — confirm whether that is intentional.
  printf("Vector reference norm: %.3e\n", sqrt(tot_norm_sq));

  return (tot_rel_err <= error_margin) ? EXIT_SUCCESS : EXIT_FAILURE;
}
327
+
328
+ // Overload for cute::Tensor<>
329
+ template <class Engine, class Layout>
330
+ int
331
+ print_relative_error(
332
+ cute::Tensor<Engine, Layout> data,
333
+ cute::Tensor<Engine, Layout> reference,
334
+ bool print_verbose = false,
335
+ bool print_error = true,
336
+ double error_margin = 0.00001) {
337
+ assert(size(data) == size(reference));
338
+ return print_relative_error(static_cast<std::size_t>(size(data)),
339
+ data, reference,
340
+ print_verbose, print_error, error_margin);
341
+ }
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/inner_product.h ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GEMM in host-side code.
33
+ */
34
+ #pragma once
35
+
36
+ #include "cutlass/cutlass.h"
37
+ #include "cutlass/array.h"
38
+
39
+ namespace cutlass {
40
+ namespace reference {
41
+ namespace detail {
42
+
43
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ /// Template function to compute an inner product.
46
+ #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate with a
47
+ // host-only type
48
+ template <typename Atype, typename Btype, typename Ctype>
49
+ CUTLASS_HOST_DEVICE
50
+ Ctype inner_product(Atype a, Btype b, Ctype c) {
51
+ return Ctype(a) * Ctype(b) + c;
52
+ }
53
+
54
+ /// Specialization for matrix multiplication with binary operands
55
+ template <>
56
+ CUTLASS_HOST_DEVICE
57
+ int inner_product<Array<bin1_t, 32>, Array<bin1_t, 32>, int>(
58
+ Array<bin1_t, 32> a,
59
+ Array<bin1_t, 32> b,
60
+ int c) {
61
+
62
+ int accum = 0;
63
+ for (int bit = 0; bit < 32; bit++) {
64
+ accum += a[bit] ^ b[bit];
65
+ }
66
+ return accum + c;
67
+ }
68
+
69
+ /*
70
+ /// Specialization for matrix multiplication with signed 4-bit integer operands
71
+ template <>
72
+ CUTLASS_HOST_DEVICE
73
+ int inner_product<Array<int4b_t, 8>, Array<int4b_t, 8>, int>(
74
+ Array<int4b_t, 8> a,
75
+ Array<int4b_t, 8> b,
76
+ int c) {
77
+
78
+ int accum = 0;
79
+ for (int k = 0; k < 8; k++) {
80
+ accum += a[k] * b[k];
81
+ }
82
+ return accum + c;
83
+ }
84
+
85
+ /// Specialization for matrix multiplication with unsigned 4-bit integer operands
86
+ template <>
87
+ CUTLASS_HOST_DEVICE
88
+ int inner_product<Array<uint4b_t, 8>, Array<uint4b_t, 8>, int>(
89
+ Array<uint4b_t, 8> a,
90
+ Array<uint4b_t, 8> b,
91
+ int c) {
92
+
93
+ int accum = 0;
94
+ for (int k = 0; k < 8; k++) {
95
+ accum += a[k] * b[k];
96
+ }
97
+ return accum + c;
98
+ }
99
+ */
100
+
101
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
102
+
103
+ template <typename SrcType, typename DstType>
104
+ struct Cast {
105
+ // Default behavior: convert to the destination type
106
+ #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
107
+ // host-only type
108
+ CUTLASS_HOST_DEVICE
109
+ static DstType apply(SrcType src) { return static_cast<DstType>(src); };
110
+ };
111
+
112
+ template <>
113
+ struct Cast<float, int8_t> {
114
+ CUTLASS_HOST_DEVICE
115
+ static int8_t apply(float src) {
116
+ // Clamp to the range of signed 8-bit integers.
117
+ return static_cast<int8_t>(fmaxf(-128.f, fminf(127.f, src)));
118
+ };
119
+ };
120
+
121
+ template <>
122
+ struct Cast<float, uint8_t> {
123
+ CUTLASS_HOST_DEVICE
124
+ static uint8_t apply(float src) {
125
+ // Clamp to the range of signed 8-bit integers.
126
+ return static_cast<uint8_t>(fmaxf(0.f, fminf(255.f, src)));
127
+ };
128
+ };
129
+
130
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
131
+
132
+ } // namespace detail
133
+ } // namespace reference
134
+ } // namespace cutlass
135
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GEMM in host-side code.
33
+ */
34
+ #pragma once
35
+
36
+ #include "cutlass/cutlass.h"
37
+ #include "cutlass/coord.h"
38
+
39
+ /////////////////////////////////////////////////////////////////////////////////////////////////
40
+
41
+ namespace cutlass {
42
+ namespace reference {
43
+ namespace detail {
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ template <int Rank, int Index>
48
+ struct LinearToCoordinateHelper {
49
+
50
+ CUTLASS_HOST_DEVICE
51
+ void operator()(Coord<Rank> &coord, int64_t idx, Coord<Rank> const &extent) const {
52
+
53
+ int64_t prod = 1;
54
+
55
+ CUTLASS_PRAGMA_UNROLL
56
+ for (int i = Rank - Index; i < Rank; ++i) {
57
+ prod *= int64_t(extent[i]);
58
+ }
59
+
60
+ coord[Rank - Index - 1] = int(idx / prod);
61
+
62
+ int64_t residual = idx % prod;
63
+ LinearToCoordinateHelper<Rank, Index - 1>()(coord, residual, extent);
64
+ }
65
+ };
66
+
67
+ template <int Rank>
68
+ struct LinearToCoordinateHelper<Rank, 0> {
69
+
70
+ CUTLASS_HOST_DEVICE
71
+ void operator()(Coord<Rank> &coord, int64_t idx, Coord<Rank> const &) const {
72
+ coord[Rank - 1] = int(idx);
73
+ }
74
+ };
75
+
76
+ /////////////////////////////////////////////////////////////////////////////////////////////////
77
+
78
+ template <int Rank>
79
+ struct LinearToCoordinate {
80
+
81
+ CUTLASS_HOST_DEVICE
82
+ void operator()(Coord<Rank> &coord, int64_t idx, Coord<Rank> const &extent) const {
83
+ LinearToCoordinateHelper<Rank, Rank - 1>()(coord, idx, extent);
84
+ }
85
+ };
86
+
87
+ /////////////////////////////////////////////////////////////////////////////////////////////////
88
+
89
+ } // namespace detail
90
+ } // namespace reference
91
+ } // namespace cutlass
92
+
93
+ /////////////////////////////////////////////////////////////////////////////////////////////////
94
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/convolution.h ADDED
@@ -0,0 +1,1549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Reference implementation for convolution in device-side code.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/coord.h"
39
+ #include "cutlass/functional.h"
40
+ #include "cutlass/layout/tensor.h"
41
+ #include "cutlass/matrix_shape.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/numeric_types.h"
44
+ #include "cutlass/tensor_ref.h"
45
+ #include "cutlass/conv/convolution.h"
46
+ #include "cutlass/conv/conv2d_problem_size.h"
47
+ #include "cutlass/conv/conv3d_problem_size.h"
48
+
49
+ namespace cutlass {
50
+ namespace reference {
51
+ namespace device {
52
+
53
+ /////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ namespace kernel {
56
+
57
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
58
+ /// Conv2d device reference kernel
59
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
60
+
61
+ // Conv2d Fprop kernel - y = fprop(x, w)
62
+ template <
63
+ typename ElementA,
64
+ typename LayoutA,
65
+ typename ElementB,
66
+ typename LayoutB,
67
+ typename ElementC,
68
+ typename LayoutC,
69
+ typename ElementCompute,
70
+ typename ElementAccumulator = ElementCompute,
71
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
72
+ typename InnerProductOp = multiply_add<ElementAccumulator>,
73
+ int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension
74
+ int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension
75
+ int kCtaShapeM = 16, // shape of a threadblock in units of threads
76
+ int kCtaShapeN = 8 // shape of a threadblock in units of threads
77
+ >
78
+ __global__ void Conv2dFprop(
79
+ conv::Conv2dProblemSize problem_size,
80
+ TensorRef<ElementA, LayoutA> tensor_x,
81
+ TensorRef<ElementB, LayoutB> tensor_w,
82
+ TensorRef<ElementC, LayoutC> tensor_y_in,
83
+ TensorRef<ElementC, LayoutC> tensor_y_out,
84
+ ElementCompute alpha,
85
+ ElementCompute beta
86
+ ) {
87
+
88
+ ConvertOp convert_op;
89
+ InnerProductOp inner_product_op;
90
+
91
+ ElementAccumulator element_A[kThreadM];
92
+ ElementAccumulator element_B[kThreadN];
93
+ ElementAccumulator accum[kThreadM][kThreadN];
94
+
95
+ int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
96
+ int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
97
+
98
+ int thread_n[kThreadM];
99
+ int thread_p[kThreadM];
100
+ int thread_q[kThreadM];
101
+
102
+ // Compute N, P, Q coordinates for each row of a thread's tile
103
+ int64_t PQ = int64_t(problem_size.P) * problem_size.Q;
104
+
105
+ CUTLASS_PRAGMA_UNROLL
106
+ for (int m = 0; m < kThreadM; ++m) {
107
+
108
+ int64_t npq = npq_start + m;
109
+
110
+ thread_n[m] = int(npq / PQ);
111
+
112
+ int64_t residual = npq % PQ;
113
+ thread_p[m] = int(residual / problem_size.Q);
114
+ thread_q[m] = int(residual % problem_size.Q);
115
+ }
116
+
117
+ // Clear accumulators
118
+ CUTLASS_PRAGMA_UNROLL
119
+ for (int m = 0; m < kThreadM; ++m) {
120
+ CUTLASS_PRAGMA_UNROLL
121
+ for (int n = 0; n < kThreadN; ++n) {
122
+ accum[m][n] = ElementAccumulator();
123
+ }
124
+ }
125
+
126
+ int c_per_group = problem_size.C / problem_size.groups;
127
+ int k_per_group = problem_size.K / problem_size.groups;
128
+
129
+ // Compute convolution
130
+ for (int R = 0; R < problem_size.R; ++R) {
131
+ for (int S = 0; S < problem_size.S; ++S) {
132
+ for (int C = 0; C < problem_size.C; ++C) {
133
+
134
+ // Get group id of currnet channel
135
+ int c_group_idx = C / c_per_group;
136
+
137
+ // Load from activations tensor
138
+ int filter_r = R;
139
+ int filter_s = S;
140
+
141
+ if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
142
+ filter_r = problem_size.R - 1 - R;
143
+ filter_s = problem_size.S - 1 - S;
144
+ }
145
+
146
+ CUTLASS_PRAGMA_UNROLL
147
+ for (int m = 0; m < kThreadM; ++m) {
148
+ int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
149
+ int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
150
+
151
+ if (thread_n[m] < problem_size.N && h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) {
152
+ element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], h, w, C}));
153
+ }
154
+ else {
155
+ element_A[m] = ElementAccumulator();
156
+ }
157
+ }
158
+
159
+ // Load from filters tensor
160
+ CUTLASS_PRAGMA_UNROLL
161
+ for (int n = 0; n < kThreadN; ++n) {
162
+ int thread_k = k_start + n;
163
+ int k_group_idx = thread_k / k_per_group;
164
+
165
+ if (thread_k < problem_size.K && k_group_idx == c_group_idx) {
166
+ element_B[n] = ElementAccumulator(tensor_w.at({thread_k, R, S, C % c_per_group}));
167
+ }
168
+ else {
169
+ element_B[n] = ElementAccumulator();
170
+ }
171
+ }
172
+
173
+ // Accumulate matrix product
174
+ CUTLASS_PRAGMA_UNROLL
175
+ for (int m = 0; m < kThreadM; ++m) {
176
+ CUTLASS_PRAGMA_UNROLL
177
+ for (int n = 0; n < kThreadN; ++n) {
178
+ accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
179
+ }
180
+ }
181
+ }
182
+ }
183
+ }
184
+
185
+ // Write out the results
186
+ CUTLASS_PRAGMA_UNROLL
187
+ for (int m = 0; m < kThreadM; ++m) {
188
+ if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) {
189
+ CUTLASS_PRAGMA_UNROLL
190
+ for (int n = 0; n < kThreadN; ++n) {
191
+ int thread_k = k_start + n;
192
+ if (thread_k < problem_size.K) {
193
+
194
+ ElementCompute c_ref = ElementCompute();
195
+ if (beta != ElementCompute()) {
196
+ c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
197
+ }
198
+
199
+ tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
200
+ alpha * ElementCompute(accum[m][n]) + beta * c_ref);
201
+ }
202
+ }
203
+ }
204
+ }
205
+ }
206
+
207
+ // Conv3d Fprop kernel - y = fprop(x, w)
208
+ template <
209
+ typename ElementA,
210
+ typename LayoutA,
211
+ typename ElementB,
212
+ typename LayoutB,
213
+ typename ElementC,
214
+ typename LayoutC,
215
+ typename ElementCompute,
216
+ typename ElementAccumulator = ElementCompute,
217
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
218
+ typename InnerProductOp = multiply_add<ElementAccumulator>,
219
+ int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension
220
+ int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension
221
+ int kCtaShapeM = 16, // shape of a threadblock in units of threads
222
+ int kCtaShapeN = 8 // shape of a threadblock in units of threads
223
+ >
224
+ __global__ void Conv3dFprop(
225
+ conv::Conv3dProblemSize problem_size,
226
+ TensorRef<ElementA, LayoutA> tensor_x,
227
+ TensorRef<ElementB, LayoutB> tensor_w,
228
+ TensorRef<ElementC, LayoutC> tensor_y_in,
229
+ TensorRef<ElementC, LayoutC> tensor_y_out,
230
+ ElementCompute alpha,
231
+ ElementCompute beta
232
+ ) {
233
+
234
+ ConvertOp convert_op;
235
+ InnerProductOp inner_product_op;
236
+
237
+ ElementAccumulator element_A[kThreadM];
238
+ ElementAccumulator element_B[kThreadN];
239
+ ElementAccumulator accum[kThreadM][kThreadN];
240
+
241
+ int64_t nzpq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
242
+ int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
243
+
244
+ int thread_n[kThreadM];
245
+ int thread_z[kThreadM];
246
+ int thread_p[kThreadM];
247
+ int thread_q[kThreadM];
248
+
249
+ // Compute N, Z, P, Q coordinates for each row of a thread's tile
250
+ int64_t PQ = int64_t(problem_size.P) * problem_size.Q;
251
+ int64_t ZPQ = PQ * problem_size.Z;
252
+
253
+ CUTLASS_PRAGMA_UNROLL
254
+ for (int m = 0; m < kThreadM; ++m) {
255
+
256
+ int64_t nzpq = nzpq_start + m;
257
+
258
+ thread_n[m] = int(nzpq / ZPQ);
259
+
260
+ int64_t residual = nzpq % ZPQ;
261
+ thread_z[m] = int(residual / PQ);
262
+
263
+ residual = residual % PQ;
264
+ thread_p[m] = int(residual / problem_size.Q);
265
+ thread_q[m] = int(residual % problem_size.Q);
266
+ }
267
+
268
+ // Clear accumulators
269
+ CUTLASS_PRAGMA_UNROLL
270
+ for (int m = 0; m < kThreadM; ++m) {
271
+ CUTLASS_PRAGMA_UNROLL
272
+ for (int n = 0; n < kThreadN; ++n) {
273
+ accum[m][n] = ElementAccumulator();
274
+ }
275
+ }
276
+
277
+ // Compute convolution
278
+ for (int T = 0; T < problem_size.T; ++T) {
279
+ for (int R = 0; R < problem_size.R; ++R) {
280
+ for (int S = 0; S < problem_size.S; ++S) {
281
+ for (int C = 0; C < problem_size.C; ++C) {
282
+
283
+ // Load from activations tensor
284
+ int filter_t = T;
285
+ int filter_r = R;
286
+ int filter_s = S;
287
+
288
+ if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
289
+ filter_t = problem_size.T - 1 - T;
290
+ filter_r = problem_size.R - 1 - R;
291
+ filter_s = problem_size.S - 1 - S;
292
+ }
293
+
294
+ CUTLASS_PRAGMA_UNROLL
295
+ for (int m = 0; m < kThreadM; ++m) {
296
+ int d = thread_z[m] * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
297
+ int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
298
+ int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
299
+
300
+ if (thread_n[m] < problem_size.N &&
301
+ d >= 0 && d < problem_size.D &&
302
+ h >= 0 && h < problem_size.H &&
303
+ w >= 0 && w < problem_size.W) {
304
+
305
+ element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], d, h, w, C}));
306
+ }
307
+ else {
308
+ element_A[m] = ElementAccumulator();
309
+ }
310
+ }
311
+
312
+ // Load from filters tensor
313
+ CUTLASS_PRAGMA_UNROLL
314
+ for (int n = 0; n < kThreadN; ++n) {
315
+ int thread_k = k_start + n;
316
+
317
+ if (thread_k < problem_size.K) {
318
+ element_B[n] = ElementAccumulator(tensor_w.at({thread_k, T, R, S, C}));
319
+ }
320
+ else {
321
+ element_B[n] = ElementAccumulator();
322
+ }
323
+ }
324
+
325
+ // Accumulate matrix product
326
+ CUTLASS_PRAGMA_UNROLL
327
+ for (int m = 0; m < kThreadM; ++m) {
328
+ CUTLASS_PRAGMA_UNROLL
329
+ for (int n = 0; n < kThreadN; ++n) {
330
+ accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
331
+ }
332
+ }
333
+
334
+ } // for (C)
335
+ } // for (S)
336
+ } // for (R)
337
+ } // for (T)
338
+
339
+ // Write out the results
340
+ CUTLASS_PRAGMA_UNROLL
341
+ for (int m = 0; m < kThreadM; ++m) {
342
+
343
+ if (thread_n[m] < problem_size.N &&
344
+ thread_z[m] < problem_size.Z &&
345
+ thread_p[m] < problem_size.P &&
346
+ thread_q[m] < problem_size.Q) {
347
+
348
+ CUTLASS_PRAGMA_UNROLL
349
+ for (int n = 0; n < kThreadN; ++n) {
350
+ int thread_k = k_start + n;
351
+ if (thread_k < problem_size.K) {
352
+
353
+ ElementCompute c_ref = ElementCompute();
354
+ if (beta != ElementCompute()) {
355
+ c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}));
356
+ }
357
+
358
+ tensor_y_out.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
359
+ alpha * ElementCompute(accum[m][n]) + beta * c_ref);
360
+ }
361
+ } // for (n)
362
+
363
+ }
364
+ } // for (m)
365
+ }
366
+
367
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
368
+
369
+ // Conv2d dgrad kernel - dx = dgrad(dy, w)
370
+ template <
371
+ typename ElementA,
372
+ typename LayoutA,
373
+ typename ElementB,
374
+ typename LayoutB,
375
+ typename ElementC,
376
+ typename LayoutC,
377
+ typename ElementCompute,
378
+ typename ElementAccumulator = ElementCompute,
379
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
380
+ typename InnerProductOp = multiply_add<ElementAccumulator>,
381
+ int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension
382
+ int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension
383
+ int kCtaShapeM = 16, // shape of a threadblock in units of threads
384
+ int kCtaShapeN = 8 // shape of a threadblock in units of threads
385
+ >
386
+ __global__ void Conv2dDgrad(
387
+ conv::Conv2dProblemSize problem_size,
388
+ TensorRef<ElementA, LayoutA> tensor_dy,
389
+ TensorRef<ElementB, LayoutB> tensor_w,
390
+ TensorRef<ElementC, LayoutC> tensor_dx_in,
391
+ TensorRef<ElementC, LayoutC> tensor_dx_out,
392
+ ElementCompute alpha,
393
+ ElementCompute beta
394
+ ) {
395
+
396
+ ConvertOp convert_op;
397
+ InnerProductOp inner_product_op;
398
+
399
+ ElementAccumulator element_A[kThreadM];
400
+ ElementAccumulator element_B[kThreadN];
401
+ ElementAccumulator accum[kThreadM][kThreadN];
402
+
403
+ int64_t nhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
404
+ int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
405
+
406
+ int thread_n[kThreadM];
407
+ int thread_h[kThreadM];
408
+ int thread_w[kThreadM];
409
+
410
+ // Compute N, H, W coordinates for each row of a thread's tile
411
+ int64_t HW = int64_t(problem_size.H) * problem_size.W;
412
+
413
+ CUTLASS_PRAGMA_UNROLL
414
+ for (int m = 0; m < kThreadM; ++m) {
415
+
416
+ int64_t nhw = nhw_start + m;
417
+
418
+ thread_n[m] = int(nhw / HW);
419
+
420
+ int64_t residual = nhw % HW;
421
+ thread_h[m] = int(residual / problem_size.W);
422
+ thread_w[m] = int(residual % problem_size.W);
423
+ }
424
+
425
+ // Clear accumulators
426
+ CUTLASS_PRAGMA_UNROLL
427
+ for (int m = 0; m < kThreadM; ++m) {
428
+ CUTLASS_PRAGMA_UNROLL
429
+ for (int n = 0; n < kThreadN; ++n) {
430
+ accum[m][n] = ElementAccumulator();
431
+ }
432
+ }
433
+
434
+ // Compute convolution
435
+ for (int R = 0; R < problem_size.R; ++R) {
436
+ for (int S = 0; S < problem_size.S; ++S) {
437
+ for (int K = 0; K < problem_size.K; ++K) {
438
+
439
+ // Load from activations tensor
440
+ int filter_r = R;
441
+ int filter_s = S;
442
+
443
+ if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
444
+ filter_r = problem_size.R - 1 - R;
445
+ filter_s = problem_size.S - 1 - S;
446
+ }
447
+
448
+ CUTLASS_PRAGMA_UNROLL
449
+ for (int m = 0; m < kThreadM; ++m) {
450
+
451
+ int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h;
452
+ int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w;
453
+
454
+ element_A[m] = ElementAccumulator();
455
+
456
+ if (p >= 0 && !(p % problem_size.stride_h) && q >= 0 && !(q % problem_size.stride_w)) {
457
+
458
+ p = p / problem_size.stride_h;
459
+ q = q / problem_size.stride_w;
460
+
461
+ if (thread_n[m] < problem_size.N && p < problem_size.P && q < problem_size.Q) {
462
+ element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], p, q, K}));
463
+ }
464
+ }
465
+ }
466
+
467
+ // Load from filters tensor
468
+ CUTLASS_PRAGMA_UNROLL
469
+ for (int n = 0; n < kThreadN; ++n) {
470
+ int thread_c = c_start + n;
471
+
472
+ if (thread_c < problem_size.C) {
473
+ element_B[n] = ElementAccumulator(tensor_w.at({K, R, S, thread_c}));
474
+ }
475
+ else {
476
+ element_B[n] = ElementAccumulator();
477
+ }
478
+ }
479
+
480
+ // Accumulate matrix product
481
+ CUTLASS_PRAGMA_UNROLL
482
+ for (int m = 0; m < kThreadM; ++m) {
483
+ CUTLASS_PRAGMA_UNROLL
484
+ for (int n = 0; n < kThreadN; ++n) {
485
+ accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
486
+ }
487
+ }
488
+ }
489
+ }
490
+ }
491
+
492
+ // Write out the results
493
+ CUTLASS_PRAGMA_UNROLL
494
+ for (int m = 0; m < kThreadM; ++m) {
495
+
496
+ if (thread_n[m] < problem_size.N && thread_h[m] < problem_size.H && thread_w[m] < problem_size.W) {
497
+
498
+ CUTLASS_PRAGMA_UNROLL
499
+ for (int n = 0; n < kThreadN; ++n) {
500
+ int thread_c = c_start + n;
501
+ if (thread_c < problem_size.C) {
502
+
503
+ ElementCompute c_ref = ElementCompute();
504
+ if (beta != ElementCompute()) {
505
+ c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_h[m], thread_w[m], thread_c}));
506
+ }
507
+
508
+ tensor_dx_out.at({thread_n[m], thread_h[m], thread_w[m], thread_c}) = convert_op(
509
+ alpha * ElementCompute(accum[m][n]) + beta * c_ref);
510
+ }
511
+ }
512
+ }
513
+ }
514
+ }
515
+
516
// Conv3d dgrad kernel - dx = dgrad(dy, w)
//
// Naive reference device kernel computing the 3-D convolution data gradient as an
// implicit GEMM: the GEMM M dimension enumerates dx positions (N, D, H, W) and the
// GEMM N dimension enumerates input channels C. Each thread accumulates a
// kThreadM x kThreadN tile and applies the epilogue
//
//   dx_out = alpha * accum + beta * dx_in
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>,
  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
  int kCtaShapeM = 16,    // shape of a threadblock in units of threads
  int kCtaShapeN = 8      // shape of a threadblock in units of threads
>
__global__ void Conv3dDgrad(
  conv::Conv3dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_dy,      // output gradient, indexed as (n, z, p, q, k)
  TensorRef<ElementB, LayoutB> tensor_w,       // filters, indexed as (k, t, r, s, c)
  TensorRef<ElementC, LayoutC> tensor_dx_in,   // source operand blended in when beta != 0
  TensorRef<ElementC, LayoutC> tensor_dx_out,  // computed data gradient, indexed as (n, d, h, w, c)
  ElementCompute alpha,
  ElementCompute beta
) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Per-thread fragments of the implicit GEMM operands and accumulator tile.
  ElementAccumulator element_A[kThreadM];
  ElementAccumulator element_B[kThreadN];
  ElementAccumulator accum[kThreadM][kThreadN];

  // Starting flattened (n, d, h, w) index and starting channel for this thread's tile.
  int64_t ndhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
  int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;

  int thread_n[kThreadM];
  int thread_d[kThreadM];
  int thread_h[kThreadM];
  int thread_w[kThreadM];

  // Compute N, D, H, W coordinates for each row of a thread's tile
  int64_t HW = int64_t(problem_size.H) * problem_size.W;
  int64_t DHW = HW * problem_size.D;

  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {

    int64_t ndhw = ndhw_start + m;

    thread_n[m] = int(ndhw / DHW);

    int64_t residual = ndhw % DHW;
    thread_d[m] = int(residual / HW);

    residual = residual % HW;
    thread_h[m] = int(residual / problem_size.W);
    thread_w[m] = int(residual % problem_size.W);
  }

  // Clear accumulators
  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {
    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < kThreadN; ++n) {
      accum[m][n] = ElementAccumulator();
    }
  }

  // Compute convolution: reduce over filter positions (T, R, S) and output channels K.
  for (int T = 0; T < problem_size.T; ++T) {
    for (int R = 0; R < problem_size.R; ++R) {
      for (int S = 0; S < problem_size.S; ++S) {
        for (int K = 0; K < problem_size.K; ++K) {

          // Load from activations tensor
          int filter_t = T;
          int filter_r = R;
          int filter_s = S;

          // True convolution (as opposed to cross-correlation) mirrors the filter.
          if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
            filter_t = problem_size.T - 1 - T;
            filter_r = problem_size.R - 1 - R;
            filter_s = problem_size.S - 1 - S;
          }

          CUTLASS_PRAGMA_UNROLL
          for (int m = 0; m < kThreadM; ++m) {

            // Map the dx coordinate back to a (pre-stride-division) dy coordinate.
            int z = thread_d[m] + problem_size.pad_d - filter_t * problem_size.dilation_d;
            int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h;
            int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w;

            element_A[m] = ElementAccumulator();

            // Only coordinates landing exactly on a stride multiple contribute;
            // all other taps read an implicit zero.
            if (z >= 0 && !(z % problem_size.stride_d) &&
              p >= 0 && !(p % problem_size.stride_h) &&
              q >= 0 && !(q % problem_size.stride_w)) {

              z = z / problem_size.stride_d;
              p = p / problem_size.stride_h;
              q = q / problem_size.stride_w;

              if (thread_n[m] < problem_size.N && z < problem_size.Z && p < problem_size.P && q < problem_size.Q) {
                element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], z, p, q, K}));
              }
            }
          }

          // Load from filters tensor (zero-padded past the channel extent)
          CUTLASS_PRAGMA_UNROLL
          for (int n = 0; n < kThreadN; ++n) {
            int thread_c = c_start + n;

            if (thread_c < problem_size.C) {
              element_B[n] = ElementAccumulator(tensor_w.at({K, T, R, S, thread_c}));
            }
            else {
              element_B[n] = ElementAccumulator();
            }
          }

          // Accumulate matrix product
          CUTLASS_PRAGMA_UNROLL
          for (int m = 0; m < kThreadM; ++m) {
            CUTLASS_PRAGMA_UNROLL
            for (int n = 0; n < kThreadN; ++n) {
              accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
            }
          }

        } // for (K)
      } // for (S)
    } // for (R)
  } // for (T)

  // Write out the results with the linear-scaling epilogue.
  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {

    if (thread_n[m] < problem_size.N &&
      thread_d[m] < problem_size.D &&
      thread_h[m] < problem_size.H &&
      thread_w[m] < problem_size.W) {

      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < kThreadN; ++n) {
        int thread_c = c_start + n;
        if (thread_c < problem_size.C) {

          ElementCompute c_ref = ElementCompute();

          // Skip the source read entirely when beta == 0.
          if (beta != ElementCompute()) {
            c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}));
          }

          tensor_dx_out.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}) = convert_op(
            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
        }
      }
    }
  }
}
679
+
680
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
681
+
682
// Conv2d wgrad kernel - dw = wgrad(dy, x)
//
// Naive reference device kernel computing the 2-D convolution weight gradient as an
// implicit GEMM: the GEMM M dimension enumerates output channels K and the GEMM N
// dimension enumerates flattened filter positions (R, S, C). Each thread accumulates
// a kThreadM x kThreadN tile and applies the epilogue
//
//   dw_out = alpha * accum + beta * dw_in
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>,
  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
  int kCtaShapeM = 8,     // shape of a threadblock in units of threads
  int kCtaShapeN = 16     // shape of a threadblock in units of threads
>
__global__ void Conv2dWgrad(
  conv::Conv2dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_dy,      // output gradient, indexed as (n, p, q, k)
  TensorRef<ElementB, LayoutB> tensor_x,       // activations, indexed as (n, h, w, c)
  TensorRef<ElementC, LayoutC> tensor_dw_in,   // source operand blended in when beta != 0
  TensorRef<ElementC, LayoutC> tensor_dw_out,  // computed weight gradient, indexed as (k, r, s, c)
  ElementCompute alpha,
  ElementCompute beta
) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Per-thread fragments of the implicit GEMM operands and accumulator tile.
  ElementAccumulator element_A[kThreadM];
  ElementAccumulator element_B[kThreadN];
  ElementAccumulator accum[kThreadM][kThreadN];

  // Starting output channel and starting flattened (r, s, c) index for this thread's tile.
  int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
  int64_t rsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;

  int thread_r[kThreadN];
  int thread_s[kThreadN];
  int thread_c[kThreadN];

  // Compute R, S, C coordinates for each column of a thread's tile
  int64_t SC = int64_t(problem_size.S) * problem_size.C;

  CUTLASS_PRAGMA_UNROLL
  for (int n = 0; n < kThreadN; ++n) {

    int64_t rsc = rsc_start + n;
    int64_t residual = rsc % SC;

    thread_r[n] = int(rsc / SC);
    thread_s[n] = int(residual / problem_size.C);
    thread_c[n] = int(residual % problem_size.C);
  }

  // Clear accumulators
  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {
    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < kThreadN; ++n) {
      accum[m][n] = ElementAccumulator();
    }
  }

  // Compute convolution: reduce over all output positions (N, P, Q).
  for (int N = 0; N < problem_size.N; ++N) {
    for (int P = 0; P < problem_size.P; ++P) {
      for (int Q = 0; Q < problem_size.Q; ++Q) {

        // Load from output-gradient tensor (zero-padded past the K extent)
        CUTLASS_PRAGMA_UNROLL
        for (int m = 0; m < kThreadM; ++m) {
          int thread_k = k_start + m;

          element_A[m] = ElementAccumulator();

          if (thread_k < problem_size.K) {
            element_A[m] = ElementAccumulator(tensor_dy.at({N, P, Q, thread_k}));
          }
        }

        // Load from activations tensor
        CUTLASS_PRAGMA_UNROLL
        for (int n = 0; n < kThreadN; ++n) {

          int filter_r = thread_r[n];
          int filter_s = thread_s[n];

          // True convolution (as opposed to cross-correlation) mirrors the filter.
          if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
            filter_r = problem_size.R - 1 - filter_r;
            filter_s = problem_size.S - 1 - filter_s;
          }

          // Map the (output position, filter tap) pair to an input coordinate.
          int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
          int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;

          element_B[n] = ElementAccumulator();

          // Out-of-bounds taps read an implicit zero (padding).
          if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W && thread_c[n] < problem_size.C) {
            element_B[n] = ElementAccumulator(tensor_x.at({N, h, w, thread_c[n]}));
          }
        }

        // Accumulate matrix product
        CUTLASS_PRAGMA_UNROLL
        for (int m = 0; m < kThreadM; ++m) {
          CUTLASS_PRAGMA_UNROLL
          for (int n = 0; n < kThreadN; ++n) {
            accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
          }
        }
      }
    }
  }

  // Write out the results with the linear-scaling epilogue.
  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {
    int thread_k = k_start + m;

    if (thread_k < problem_size.K) {

      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < kThreadN; ++n) {

        if (thread_r[n] < problem_size.R && thread_s[n] < problem_size.S && thread_c[n] < problem_size.C) {

          ElementCompute c_ref = ElementCompute();

          // Skip the source read entirely when beta == 0.
          if (beta != ElementCompute()) {
            c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}));
          }

          tensor_dw_out.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}) = convert_op(
            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
        }
      }
    }
  }
}
822
+
823
// Conv3d wgrad kernel - dw = wgrad(dy, x)
//
// Naive reference device kernel computing the 3-D convolution weight gradient as an
// implicit GEMM: the GEMM M dimension enumerates output channels K and the GEMM N
// dimension enumerates flattened filter positions (T, R, S, C). Each thread
// accumulates a kThreadM x kThreadN tile and applies the epilogue
//
//   dw_out = alpha * accum + beta * dw_in
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>,
  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
  int kCtaShapeM = 8,     // shape of a threadblock in units of threads
  int kCtaShapeN = 16     // shape of a threadblock in units of threads
>
__global__ void Conv3dWgrad(
  conv::Conv3dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_dy,      // output gradient, indexed as (n, z, p, q, k)
  TensorRef<ElementB, LayoutB> tensor_x,       // activations, indexed as (n, d, h, w, c)
  TensorRef<ElementC, LayoutC> tensor_dw_in,   // source operand blended in when beta != 0
  TensorRef<ElementC, LayoutC> tensor_dw_out,  // computed weight gradient, indexed as (k, t, r, s, c)
  ElementCompute alpha,
  ElementCompute beta
) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Per-thread fragments of the implicit GEMM operands and accumulator tile.
  ElementAccumulator element_A[kThreadM];
  ElementAccumulator element_B[kThreadN];
  ElementAccumulator accum[kThreadM][kThreadN];

  // Starting output channel and starting flattened (t, r, s, c) index for this thread's tile.
  int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
  int64_t trsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;

  int thread_t[kThreadN];
  int thread_r[kThreadN];
  int thread_s[kThreadN];
  int thread_c[kThreadN];

  // Compute T, R, S, C coordinates for each column of a thread's tile
  int64_t SC = int64_t(problem_size.S) * problem_size.C;
  int64_t RSC = SC * problem_size.R;

  CUTLASS_PRAGMA_UNROLL
  for (int n = 0; n < kThreadN; ++n) {

    int64_t trsc = trsc_start + n;

    thread_t[n] = int(trsc / RSC);

    int64_t residual = trsc % RSC;
    thread_r[n] = int(residual / SC);

    residual = residual % SC;
    thread_s[n] = int(residual / problem_size.C);
    thread_c[n] = int(residual % problem_size.C);
  }

  // Clear accumulators
  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {
    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < kThreadN; ++n) {
      accum[m][n] = ElementAccumulator();
    }
  }

  // Compute convolution: reduce over all output positions (N, Z, P, Q).
  for (int N = 0; N < problem_size.N; ++N) {
    for (int Z = 0; Z < problem_size.Z; ++Z) {
      for (int P = 0; P < problem_size.P; ++P) {
        for (int Q = 0; Q < problem_size.Q; ++Q) {

          // Load from output-gradient tensor (zero-padded past the K extent)
          CUTLASS_PRAGMA_UNROLL
          for (int m = 0; m < kThreadM; ++m) {
            int thread_k = k_start + m;

            element_A[m] = ElementAccumulator();

            if (thread_k < problem_size.K) {
              element_A[m] = ElementAccumulator(tensor_dy.at({N, Z, P, Q, thread_k}));
            }
          }

          // Load from activations tensor
          CUTLASS_PRAGMA_UNROLL
          for (int n = 0; n < kThreadN; ++n) {

            int filter_t = thread_t[n];
            int filter_r = thread_r[n];
            int filter_s = thread_s[n];

            // True convolution (as opposed to cross-correlation) mirrors the filter.
            if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
              filter_t = problem_size.T - 1 - filter_t;
              filter_r = problem_size.R - 1 - filter_r;
              filter_s = problem_size.S - 1 - filter_s;
            }

            // Map the (output position, filter tap) pair to an input coordinate.
            int d = Z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
            int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
            int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;

            element_B[n] = ElementAccumulator();

            // Out-of-bounds taps read an implicit zero (padding).
            if (d >= 0 && d < problem_size.D &&
              h >= 0 && h < problem_size.H &&
              w >= 0 && w < problem_size.W &&
              thread_c[n] < problem_size.C) {

              element_B[n] = ElementAccumulator(tensor_x.at({N, d, h, w, thread_c[n]}));
            }
          }

          // Accumulate matrix product
          CUTLASS_PRAGMA_UNROLL
          for (int m = 0; m < kThreadM; ++m) {
            CUTLASS_PRAGMA_UNROLL
            for (int n = 0; n < kThreadN; ++n) {
              accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
            }
          }

        } // for (Q)
      } // for (P)
    } // for (Z)
  } // for (N)

  // Write out the results with the linear-scaling epilogue.
  CUTLASS_PRAGMA_UNROLL
  for (int m = 0; m < kThreadM; ++m) {
    int thread_k = k_start + m;

    if (thread_k < problem_size.K) {

      CUTLASS_PRAGMA_UNROLL
      for (int n = 0; n < kThreadN; ++n) {

        if (thread_t[n] < problem_size.T &&
          thread_r[n] < problem_size.R &&
          thread_s[n] < problem_size.S &&
          thread_c[n] < problem_size.C) {

          ElementCompute c_ref = ElementCompute();

          // Skip the source read entirely when beta == 0.
          if (beta != ElementCompute()) {
            c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}));
          }

          tensor_dw_out.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}) = convert_op(
            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
        }
      }
    }
  }
}
982
+
983
+ /////////////////////////////////////////////////////////////////////////////////////////////////
984
+
985
+ } // namespace kernel
986
+
987
+ /////////////////////////////////////////////////////////////////////////////////////////////////
988
+
989
+ /// Conv2d Fprop dispatcher - y = fprop(x, w)
990
+ template <
991
+ typename ElementA,
992
+ typename LayoutA,
993
+ typename ElementB,
994
+ typename LayoutB,
995
+ typename ElementC,
996
+ typename LayoutC,
997
+ typename ElementCompute,
998
+ typename ElementAccumulator = ElementCompute,
999
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1000
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1001
+ >
1002
+ Status Conv2dFprop(
1003
+ conv::Conv2dProblemSize problem_size,
1004
+ TensorRef<ElementA, LayoutA> tensor_x,
1005
+ TensorRef<ElementB, LayoutB> tensor_w,
1006
+ TensorRef<ElementC, LayoutC> tensor_y_in,
1007
+ TensorRef<ElementC, LayoutC> tensor_y_out,
1008
+ ElementCompute alpha,
1009
+ ElementCompute beta,
1010
+ cudaStream_t stream = nullptr) {
1011
+
1012
+ //
1013
+ // Blocking factors improve performance of reference implementation
1014
+ //
1015
+
1016
+ int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension
1017
+ int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
1018
+ int const kCtaShapeM = 16; // shape of a threadblock in units of threads
1019
+ int const kCtaShapeN = 8; // shape of a threadblock in units of threads
1020
+
1021
+ int64_t npq = int64_t(problem_size.N) * problem_size.P * problem_size.Q;
1022
+ int64_t blocks_m = (npq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM);
1023
+
1024
+ dim3 block(kCtaShapeM, kCtaShapeN);
1025
+ dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN));
1026
+
1027
+ kernel::Conv2dFprop<
1028
+ ElementA,
1029
+ LayoutA,
1030
+ ElementB,
1031
+ LayoutB,
1032
+ ElementC,
1033
+ LayoutC,
1034
+ ElementCompute,
1035
+ ElementAccumulator,
1036
+ ConvertOp,
1037
+ InnerProductOp,
1038
+ kThreadM,
1039
+ kThreadN,
1040
+ kCtaShapeM,
1041
+ kCtaShapeN
1042
+ ><<< grid, block, 0, stream >>>(
1043
+ problem_size,
1044
+ tensor_x,
1045
+ tensor_w,
1046
+ tensor_y_in,
1047
+ tensor_y_out,
1048
+ alpha,
1049
+ beta
1050
+ );
1051
+
1052
+ cudaError_t result = cudaPeekAtLastError();
1053
+ if (result != cudaSuccess) {
1054
+ return Status::kErrorInternal;
1055
+ }
1056
+
1057
+ return Status::kSuccess;
1058
+ }
1059
+
1060
+ /// Conv3d Fprop dispatcher - y = fprop(x, w)
1061
+ template <
1062
+ typename ElementA,
1063
+ typename LayoutA,
1064
+ typename ElementB,
1065
+ typename LayoutB,
1066
+ typename ElementC,
1067
+ typename LayoutC,
1068
+ typename ElementCompute,
1069
+ typename ElementAccumulator = ElementCompute,
1070
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1071
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1072
+ >
1073
+ Status Conv3dFprop(
1074
+ conv::Conv3dProblemSize problem_size,
1075
+ TensorRef<ElementA, LayoutA> tensor_x,
1076
+ TensorRef<ElementB, LayoutB> tensor_w,
1077
+ TensorRef<ElementC, LayoutC> tensor_y_in,
1078
+ TensorRef<ElementC, LayoutC> tensor_y_out,
1079
+ ElementCompute alpha,
1080
+ ElementCompute beta,
1081
+ cudaStream_t stream = nullptr) {
1082
+
1083
+ //
1084
+ // Blocking factors improve performance of reference implementation
1085
+ //
1086
+
1087
+ int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension
1088
+ int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
1089
+ int const kCtaShapeM = 16; // shape of a threadblock in units of threads
1090
+ int const kCtaShapeN = 8; // shape of a threadblock in units of threads
1091
+
1092
+ int64_t nzpq = int64_t(problem_size.N) * problem_size.Z * problem_size.P * problem_size.Q;
1093
+ int64_t blocks_m = (nzpq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM);
1094
+
1095
+ dim3 block(kCtaShapeM, kCtaShapeN);
1096
+ dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN));
1097
+
1098
+ kernel::Conv3dFprop<
1099
+ ElementA,
1100
+ LayoutA,
1101
+ ElementB,
1102
+ LayoutB,
1103
+ ElementC,
1104
+ LayoutC,
1105
+ ElementCompute,
1106
+ ElementAccumulator,
1107
+ ConvertOp,
1108
+ InnerProductOp,
1109
+ kThreadM,
1110
+ kThreadN,
1111
+ kCtaShapeM,
1112
+ kCtaShapeN
1113
+ ><<< grid, block, 0, stream >>>(
1114
+ problem_size,
1115
+ tensor_x,
1116
+ tensor_w,
1117
+ tensor_y_in,
1118
+ tensor_y_out,
1119
+ alpha,
1120
+ beta
1121
+ );
1122
+
1123
+ cudaError_t result = cudaPeekAtLastError();
1124
+ if (result != cudaSuccess) {
1125
+ return Status::kErrorInternal;
1126
+ }
1127
+
1128
+ return Status::kSuccess;
1129
+ }
1130
+
1131
+ /// Conv2d Dgrad dispatcher - dx = dgrad(dy, w)
1132
+ template <
1133
+ typename ElementA,
1134
+ typename LayoutA,
1135
+ typename ElementB,
1136
+ typename LayoutB,
1137
+ typename ElementC,
1138
+ typename LayoutC,
1139
+ typename ElementCompute,
1140
+ typename ElementAccumulator = ElementCompute,
1141
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1142
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1143
+ >
1144
+ Status Conv2dDgrad(
1145
+ conv::Conv2dProblemSize problem_size,
1146
+ TensorRef<ElementA, LayoutA> tensor_dy,
1147
+ TensorRef<ElementB, LayoutB> tensor_w,
1148
+ TensorRef<ElementC, LayoutC> tensor_dx_in,
1149
+ TensorRef<ElementC, LayoutC> tensor_dx_out,
1150
+ ElementCompute alpha,
1151
+ ElementCompute beta,
1152
+ cudaStream_t stream = nullptr) {
1153
+
1154
+ //
1155
+ // Blocking factors improve performance of reference implementation
1156
+ //
1157
+
1158
+ int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension
1159
+ int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
1160
+ int const kCtaShapeM = 16; // shape of a threadblock in units of threads
1161
+ int const kCtaShapeN = 8; // shape of a threadblock in units of threads
1162
+
1163
+ int64_t nhw = int64_t(problem_size.N) * problem_size.H * problem_size.W;
1164
+ int64_t blocks_m = (nhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM);
1165
+
1166
+ dim3 block(kCtaShapeM, kCtaShapeN);
1167
+ dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN));
1168
+
1169
+ kernel::Conv2dDgrad<
1170
+ ElementA,
1171
+ LayoutA,
1172
+ ElementB,
1173
+ LayoutB,
1174
+ ElementC,
1175
+ LayoutC,
1176
+ ElementCompute,
1177
+ ElementAccumulator,
1178
+ ConvertOp,
1179
+ InnerProductOp,
1180
+ kThreadM,
1181
+ kThreadN,
1182
+ kCtaShapeM,
1183
+ kCtaShapeN
1184
+ ><<< grid, block, 0, stream >>>(
1185
+ problem_size,
1186
+ tensor_dy,
1187
+ tensor_w,
1188
+ tensor_dx_in,
1189
+ tensor_dx_out,
1190
+ alpha,
1191
+ beta
1192
+ );
1193
+
1194
+ cudaError_t result = cudaPeekAtLastError();
1195
+ if (result != cudaSuccess) {
1196
+ return Status::kErrorInternal;
1197
+ }
1198
+
1199
+ return Status::kSuccess;
1200
+ }
1201
+
1202
+ /// Conv3d Dgrad dispatcher - dx = dgrad(dy, w)
1203
+ template <
1204
+ typename ElementA,
1205
+ typename LayoutA,
1206
+ typename ElementB,
1207
+ typename LayoutB,
1208
+ typename ElementC,
1209
+ typename LayoutC,
1210
+ typename ElementCompute,
1211
+ typename ElementAccumulator = ElementCompute,
1212
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1213
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1214
+ >
1215
+ Status Conv3dDgrad(
1216
+ conv::Conv3dProblemSize problem_size,
1217
+ TensorRef<ElementA, LayoutA> tensor_dy,
1218
+ TensorRef<ElementB, LayoutB> tensor_w,
1219
+ TensorRef<ElementC, LayoutC> tensor_dx_in,
1220
+ TensorRef<ElementC, LayoutC> tensor_dx_out,
1221
+ ElementCompute alpha,
1222
+ ElementCompute beta,
1223
+ cudaStream_t stream = nullptr) {
1224
+
1225
+ //
1226
+ // Blocking factors improve performance of reference implementation
1227
+ //
1228
+
1229
+ int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension
1230
+ int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
1231
+ int const kCtaShapeM = 16; // shape of a threadblock in units of threads
1232
+ int const kCtaShapeN = 8; // shape of a threadblock in units of threads
1233
+
1234
+ int64_t ndhw = int64_t(problem_size.N) * problem_size.D * problem_size.H * problem_size.W;
1235
+ int64_t blocks_m = (ndhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM);
1236
+
1237
+ dim3 block(kCtaShapeM, kCtaShapeN);
1238
+ dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN));
1239
+
1240
+ kernel::Conv3dDgrad<
1241
+ ElementA,
1242
+ LayoutA,
1243
+ ElementB,
1244
+ LayoutB,
1245
+ ElementC,
1246
+ LayoutC,
1247
+ ElementCompute,
1248
+ ElementAccumulator,
1249
+ ConvertOp,
1250
+ InnerProductOp,
1251
+ kThreadM,
1252
+ kThreadN,
1253
+ kCtaShapeM,
1254
+ kCtaShapeN
1255
+ ><<< grid, block, 0, stream >>>(
1256
+ problem_size,
1257
+ tensor_dy,
1258
+ tensor_w,
1259
+ tensor_dx_in,
1260
+ tensor_dx_out,
1261
+ alpha,
1262
+ beta
1263
+ );
1264
+
1265
+ cudaError_t result = cudaPeekAtLastError();
1266
+ if (result != cudaSuccess) {
1267
+ return Status::kErrorInternal;
1268
+ }
1269
+
1270
+ return Status::kSuccess;
1271
+ }
1272
+
1273
+ /// Conv2d Wgrad dispatcher - dw = wgrad(dy, x)
1274
+ template <
1275
+ typename ElementA,
1276
+ typename LayoutA,
1277
+ typename ElementB,
1278
+ typename LayoutB,
1279
+ typename ElementC,
1280
+ typename LayoutC,
1281
+ typename ElementCompute,
1282
+ typename ElementAccumulator = ElementCompute,
1283
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1284
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1285
+ >
1286
+ Status Conv2dWgrad(
1287
+ conv::Conv2dProblemSize problem_size,
1288
+ TensorRef<ElementA, LayoutA> tensor_dy,
1289
+ TensorRef<ElementB, LayoutB> tensor_x,
1290
+ TensorRef<ElementC, LayoutC> tensor_dw_in,
1291
+ TensorRef<ElementC, LayoutC> tensor_dw_out,
1292
+ ElementCompute alpha,
1293
+ ElementCompute beta,
1294
+ cudaStream_t stream = nullptr) {
1295
+
1296
+ //
1297
+ // Blocking factors improve performance of reference implementation
1298
+ //
1299
+
1300
+ int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension
1301
+ int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
1302
+ int const kCtaShapeM = 8; // shape of a threadblock in units of threads
1303
+ int const kCtaShapeN = 16; // shape of a threadblock in units of threads
1304
+
1305
+ int64_t rsc = int64_t(problem_size.R) * problem_size.S * problem_size.C;
1306
+ int64_t blocks_n = (rsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN);
1307
+
1308
+ dim3 block(kCtaShapeM, kCtaShapeN);
1309
+ dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n));
1310
+
1311
+ kernel::Conv2dWgrad<
1312
+ ElementA,
1313
+ LayoutA,
1314
+ ElementB,
1315
+ LayoutB,
1316
+ ElementC,
1317
+ LayoutC,
1318
+ ElementCompute,
1319
+ ElementAccumulator,
1320
+ ConvertOp,
1321
+ InnerProductOp,
1322
+ kThreadM,
1323
+ kThreadN,
1324
+ kCtaShapeM,
1325
+ kCtaShapeN
1326
+ ><<< grid, block, 0, stream >>>(
1327
+ problem_size,
1328
+ tensor_dy,
1329
+ tensor_x,
1330
+ tensor_dw_in,
1331
+ tensor_dw_out,
1332
+ alpha,
1333
+ beta
1334
+ );
1335
+
1336
+ cudaError_t result = cudaPeekAtLastError();
1337
+ if (result != cudaSuccess) {
1338
+ return Status::kErrorInternal;
1339
+ }
1340
+
1341
+ return Status::kSuccess;
1342
+ }
1343
+
1344
+ /// Conv3d Wgrad dispatcher - dw = wgrad(dy, x)
1345
+ template <
1346
+ typename ElementA,
1347
+ typename LayoutA,
1348
+ typename ElementB,
1349
+ typename LayoutB,
1350
+ typename ElementC,
1351
+ typename LayoutC,
1352
+ typename ElementCompute,
1353
+ typename ElementAccumulator = ElementCompute,
1354
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1355
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1356
+ >
1357
+ Status Conv3dWgrad(
1358
+ conv::Conv3dProblemSize problem_size,
1359
+ TensorRef<ElementA, LayoutA> tensor_dy,
1360
+ TensorRef<ElementB, LayoutB> tensor_x,
1361
+ TensorRef<ElementC, LayoutC> tensor_dw_in,
1362
+ TensorRef<ElementC, LayoutC> tensor_dw_out,
1363
+ ElementCompute alpha,
1364
+ ElementCompute beta,
1365
+ cudaStream_t stream = nullptr) {
1366
+
1367
+ //
1368
+ // Blocking factors improve performance of reference implementation
1369
+ //
1370
+
1371
+ int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension
1372
+ int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension
1373
+ int const kCtaShapeM = 8; // shape of a threadblock in units of threads
1374
+ int const kCtaShapeN = 16; // shape of a threadblock in units of threads
1375
+
1376
+ int64_t trsc = int64_t(problem_size.T) * problem_size.R * problem_size.S * problem_size.C;
1377
+ int64_t blocks_n = (trsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN);
1378
+
1379
+ dim3 block(kCtaShapeM, kCtaShapeN);
1380
+ dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n));
1381
+
1382
+ kernel::Conv3dWgrad<
1383
+ ElementA,
1384
+ LayoutA,
1385
+ ElementB,
1386
+ LayoutB,
1387
+ ElementC,
1388
+ LayoutC,
1389
+ ElementCompute,
1390
+ ElementAccumulator,
1391
+ ConvertOp,
1392
+ InnerProductOp,
1393
+ kThreadM,
1394
+ kThreadN,
1395
+ kCtaShapeM,
1396
+ kCtaShapeN
1397
+ ><<< grid, block, 0, stream >>>(
1398
+ problem_size,
1399
+ tensor_dy,
1400
+ tensor_x,
1401
+ tensor_dw_in,
1402
+ tensor_dw_out,
1403
+ alpha,
1404
+ beta
1405
+ );
1406
+
1407
+ cudaError_t result = cudaPeekAtLastError();
1408
+ if (result != cudaSuccess) {
1409
+ return Status::kErrorInternal;
1410
+ }
1411
+
1412
+ return Status::kSuccess;
1413
+ }
1414
+
1415
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1416
+
1417
+ /// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad.
1418
+ template <
1419
+ typename ElementA,
1420
+ typename LayoutA,
1421
+ typename ElementB,
1422
+ typename LayoutB,
1423
+ typename ElementC,
1424
+ typename LayoutC,
1425
+ typename ElementCompute,
1426
+ typename ElementAccumulator = ElementCompute,
1427
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1428
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1429
+ >
1430
+ Status Conv2d(
1431
+ conv::Operator convolutional_operator,
1432
+ conv::Conv2dProblemSize problem_size,
1433
+ TensorRef<ElementA, LayoutA> tensor_A,
1434
+ TensorRef<ElementB, LayoutB> tensor_B,
1435
+ TensorRef<ElementC, LayoutC> tensor_C,
1436
+ TensorRef<ElementC, LayoutC> tensor_D,
1437
+ ElementCompute alpha,
1438
+ ElementCompute beta,
1439
+ cudaStream_t stream = nullptr) {
1440
+
1441
+ switch (convolutional_operator) {
1442
+ case conv::Operator::kFprop:
1443
+ return Conv2dFprop<
1444
+ ElementA, LayoutA,
1445
+ ElementB, LayoutB,
1446
+ ElementC, LayoutC,
1447
+ ElementCompute,
1448
+ ElementAccumulator,
1449
+ ConvertOp, InnerProductOp
1450
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
1451
+ break;
1452
+
1453
+ case conv::Operator::kDgrad:
1454
+ return Conv2dDgrad<
1455
+ ElementA, LayoutA,
1456
+ ElementB, LayoutB,
1457
+ ElementC, LayoutC,
1458
+ ElementCompute,
1459
+ ElementAccumulator,
1460
+ ConvertOp, InnerProductOp
1461
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
1462
+ break;
1463
+
1464
+ case conv::Operator::kWgrad:
1465
+ return Conv2dWgrad<
1466
+ ElementA, LayoutA,
1467
+ ElementB, LayoutB,
1468
+ ElementC, LayoutC,
1469
+ ElementCompute,
1470
+ ElementAccumulator,
1471
+ ConvertOp, InnerProductOp
1472
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
1473
+ break;
1474
+
1475
+ default: break;
1476
+ }
1477
+
1478
+ return Status::kErrorNotSupported;
1479
+ }
1480
+
1481
+ /// Generic 3D convolution targeting Conv3dFprop, Conv3dDgrad, and Conv3dWgrad.
1482
+ template <
1483
+ typename ElementA,
1484
+ typename LayoutA,
1485
+ typename ElementB,
1486
+ typename LayoutB,
1487
+ typename ElementC,
1488
+ typename LayoutC,
1489
+ typename ElementCompute,
1490
+ typename ElementAccumulator = ElementCompute,
1491
+ typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
1492
+ typename InnerProductOp = multiply_add<ElementAccumulator>
1493
+ >
1494
+ Status Conv3d(
1495
+ conv::Operator convolutional_operator,
1496
+ conv::Conv3dProblemSize problem_size,
1497
+ TensorRef<ElementA, LayoutA> tensor_A,
1498
+ TensorRef<ElementB, LayoutB> tensor_B,
1499
+ TensorRef<ElementC, LayoutC> tensor_C,
1500
+ TensorRef<ElementC, LayoutC> tensor_D,
1501
+ ElementCompute alpha,
1502
+ ElementCompute beta,
1503
+ cudaStream_t stream = nullptr) {
1504
+
1505
+ switch (convolutional_operator) {
1506
+ case conv::Operator::kFprop:
1507
+ return Conv3dFprop<
1508
+ ElementA, LayoutA,
1509
+ ElementB, LayoutB,
1510
+ ElementC, LayoutC,
1511
+ ElementCompute,
1512
+ ElementAccumulator,
1513
+ ConvertOp, InnerProductOp
1514
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
1515
+
1516
+ case conv::Operator::kDgrad:
1517
+ return Conv3dDgrad<
1518
+ ElementA, LayoutA,
1519
+ ElementB, LayoutB,
1520
+ ElementC, LayoutC,
1521
+ ElementCompute,
1522
+ ElementAccumulator,
1523
+ ConvertOp, InnerProductOp
1524
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
1525
+
1526
+ case conv::Operator::kWgrad:
1527
+ return Conv3dWgrad<
1528
+ ElementA, LayoutA,
1529
+ ElementB, LayoutB,
1530
+ ElementC, LayoutC,
1531
+ ElementCompute,
1532
+ ElementAccumulator,
1533
+ ConvertOp, InnerProductOp
1534
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
1535
+
1536
+ default: break;
1537
+ }
1538
+
1539
+ return Status::kErrorNotSupported;
1540
+ }
1541
+
1542
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
1543
+
1544
+ } // namespace device
1545
+ } // namespace reference
1546
+ } // namespace cutlass
1547
+
1548
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
1549
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm.h ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GEMM in device-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/functional.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+
43
+ #include "cutlass/tensor_view.h"
44
+ #include "cutlass/gemm/gemm.h"
45
+
46
+ #include "cutlass/util/reference/device/kernel/gemm.h"
47
+
48
+ namespace cutlass {
49
+ namespace reference {
50
+ namespace device {
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
55
+ /// objects.
56
+ ///
57
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
58
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
59
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
60
+ /// arguments explicitly.
61
+ template <
62
+ typename ElementA,
63
+ typename LayoutA,
64
+ typename ElementB,
65
+ typename LayoutB,
66
+ typename ElementC,
67
+ typename LayoutC,
68
+ typename ScalarType,
69
+ typename AccumulatorType,
70
+ typename InnerProductOp = multiply_add<AccumulatorType>,
71
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
72
+ >
73
+ void compute_gemm(
74
+ gemm::GemmCoord problem_size,
75
+ ScalarType alpha,
76
+ TensorRef<ElementA, LayoutA> tensor_a,
77
+ TensorRef<ElementB, LayoutB> tensor_b,
78
+ ScalarType beta,
79
+ TensorRef<ElementC, LayoutC> tensor_c,
80
+ TensorRef<ElementC, LayoutC> tensor_d,
81
+ AccumulatorType initial_accum) {
82
+
83
+ static_assert(
84
+ LayoutA::kRank == 2 &&
85
+ LayoutB::kRank == 2 &&
86
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
87
+
88
+ // Blocking structure potentially improves performance of reference implementation
89
+ // with a minor increase in complexity.
90
+ //
91
+ // Note, this reference implementation is NOT expected to approach peak performance.
92
+ using OutputTile = MatrixShape<4, 4>;
93
+
94
+ dim3 block(16, 8);
95
+
96
+ dim3 grid(
97
+ (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
98
+ (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)
99
+ );
100
+
101
+ // Launch a GEMM kernel
102
+ kernel::Gemm<
103
+ TensorRef<ElementA, LayoutA>,
104
+ TensorRef<ElementB, LayoutB>,
105
+ TensorRef<ElementC, LayoutC>,
106
+ ScalarType,
107
+ AccumulatorType,
108
+ OutputTile,
109
+ InnerProductOp,
110
+ ConvertOp
111
+ ><<< grid, block >>>(
112
+ problem_size,
113
+ alpha,
114
+ tensor_a,
115
+ tensor_b,
116
+ beta,
117
+ tensor_c,
118
+ tensor_d,
119
+ initial_accum
120
+ );
121
+ }
122
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
123
+
124
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
125
+ /// objects.
126
+ ///
127
+ /// This assumes the accumulator type is the same type as the scalars.
128
+ template <
129
+ typename ElementA,
130
+ typename LayoutA,
131
+ typename ElementB,
132
+ typename LayoutB,
133
+ typename ElementC,
134
+ typename LayoutC,
135
+ typename ScalarType,
136
+ typename AccumulatorType,
137
+ typename InnerProductOp = multiply_add<AccumulatorType>,
138
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
139
+ >
140
+ void compute_gemm(
141
+ gemm::GemmCoord problem_size,
142
+ ScalarType alpha,
143
+ TensorRef<ElementA, LayoutA> tensor_a,
144
+ TensorRef<ElementB, LayoutB> tensor_b,
145
+ ScalarType beta,
146
+ TensorRef<ElementC, LayoutC> tensor_c,
147
+ AccumulatorType initial_accum) {
148
+
149
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
150
+ ScalarType, AccumulatorType, InnerProductOp, ConvertOp>(
151
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
152
+ initial_accum);
153
+ }
154
+
155
+ template <
156
+ typename ElementA,
157
+ typename LayoutA,
158
+ typename ElementB,
159
+ typename LayoutB,
160
+ typename ElementC,
161
+ typename LayoutC,
162
+ typename ScalarType,
163
+ typename AccumulatorType,
164
+ typename InnerProductOp = cutlass::arch::OpMultiplyAdd
165
+ >
166
+ struct Gemm;
167
+
168
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
169
+
170
+ /// Partial specialization for multiply-add
171
+ template <typename ElementA, typename LayoutA, typename ElementB,
172
+ typename LayoutB, typename ElementC, typename LayoutC,
173
+ typename ScalarType, typename AccumulatorType>
174
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
175
+ ScalarType, AccumulatorType, arch::OpMultiplyAdd> {
176
+
177
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
178
+ TensorRef<ElementA, LayoutA> tensor_a,
179
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
180
+ TensorRef<ElementC, LayoutC> tensor_c,
181
+ AccumulatorType initial_accum = AccumulatorType(0)) {
182
+
183
+ static_assert(
184
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
185
+ "Tensors must be of rank 2");
186
+
187
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
188
+ ScalarType, AccumulatorType, multiply_add<AccumulatorType>>(
189
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
190
+ }
191
+
192
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
193
+ TensorRef<ElementA, LayoutA> tensor_a,
194
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
195
+ TensorRef<ElementC, LayoutC> tensor_c,
196
+ TensorRef<ElementC, LayoutC> tensor_d,
197
+ AccumulatorType initial_accum = AccumulatorType(0)) {
198
+ static_assert(
199
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
200
+ "Tensors must be of rank 2");
201
+
202
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
203
+ ScalarType, AccumulatorType, multiply_add<AccumulatorType>>(
204
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
205
+ }
206
+ };
207
+
208
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
209
+
210
+ /// Partial specialization for multiply-add-saturate
211
+ template <typename ElementA, typename LayoutA, typename ElementB,
212
+ typename LayoutB, typename ElementC, typename LayoutC,
213
+ typename ScalarType, typename AccumulatorType>
214
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
215
+ AccumulatorType, arch::OpMultiplyAddSaturate> {
216
+
217
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
218
+ TensorRef<ElementA, LayoutA> tensor_a,
219
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
220
+ TensorRef<ElementC, LayoutC> tensor_c,
221
+ AccumulatorType initial_accum = AccumulatorType(0)) {
222
+ static_assert(
223
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
224
+ "Tensors must be of rank 2");
225
+
226
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
227
+ ScalarType, AccumulatorType, multiply_add<AccumulatorType>,
228
+ NumericConverterClamp<ElementC, ScalarType>>(
229
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
230
+ }
231
+
232
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
233
+ TensorRef<ElementA, LayoutA> tensor_a,
234
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
235
+ TensorRef<ElementC, LayoutC> tensor_c,
236
+ TensorRef<ElementC, LayoutC> tensor_d,
237
+ AccumulatorType initial_accum = AccumulatorType(0)) {
238
+ static_assert(
239
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
240
+ "Tensors must be of rank 2");
241
+
242
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
243
+ ScalarType, AccumulatorType, multiply_add<AccumulatorType>,
244
+ NumericConverterClamp<ElementC, ScalarType>>(
245
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
246
+ }
247
+ };
248
+
249
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
250
+
251
+ /// Partial specialization for XOR-popc
252
+ template <typename ElementA, typename LayoutA, typename ElementB,
253
+ typename LayoutB, typename ElementC, typename LayoutC,
254
+ typename ScalarType, typename AccumulatorType>
255
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
256
+ AccumulatorType, arch::OpXorPopc> {
257
+
258
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
259
+ TensorRef<ElementA, LayoutA> tensor_a,
260
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
261
+ TensorRef<ElementC, LayoutC> tensor_c,
262
+ AccumulatorType initial_accum = AccumulatorType(0)) {
263
+ static_assert(
264
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
265
+ "Tensors must be of rank 2");
266
+
267
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
268
+ ScalarType, AccumulatorType, xor_add<AccumulatorType>>(
269
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
270
+ }
271
+
272
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
273
+ TensorRef<ElementA, LayoutA> tensor_a,
274
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
275
+ TensorRef<ElementC, LayoutC> tensor_c,
276
+ TensorRef<ElementC, LayoutC> tensor_d,
277
+ AccumulatorType initial_accum = AccumulatorType(0)) {
278
+ static_assert(
279
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
280
+ "Tensors must be of rank 2");
281
+
282
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
283
+ ScalarType, AccumulatorType, xor_add<AccumulatorType>>(
284
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
285
+ }
286
+ };
287
+
288
+
289
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
290
+ //
291
+ // Batched GEMM
292
+ //
293
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
294
+
295
+ /// Computes a batch of GEMMs over a set of matrices of common dimension.
296
+ //
297
+ // TensorRefCollection* is a type satisfying the TensorRefCollection concept.
298
+ //
299
+ template <
300
+ typename TensorRefCollectionA,
301
+ typename TensorRefCollectionB,
302
+ typename TensorRefCollectionC,
303
+ typename ScalarType,
304
+ typename AccumulatorType,
305
+ typename InnerProductOp,
306
+ typename ConvertOp
307
+ >
308
+ void BatchedGemm(
309
+ gemm::GemmCoord problem_size,
310
+ int batch_count,
311
+ ScalarType alpha,
312
+ TensorRefCollectionA const& tensor_a,
313
+ TensorRefCollectionB const& tensor_b,
314
+ ScalarType beta,
315
+ TensorRefCollectionC &tensor_c,
316
+ AccumulatorType initial_accum) {
317
+
318
+ static_assert(
319
+ TensorRefCollectionA::kRank == 2 &&
320
+ TensorRefCollectionB::kRank == 2 &&
321
+ TensorRefCollectionC::kRank == 2, "Tensors must be of rank 2");
322
+
323
+ // Blocking structure potentially improves performance of reference implementation
324
+ // with a minor increase in complexity.
325
+ //
326
+ // Note, this reference implementation is NOT expected to approach peak performance.
327
+ using OutputTile = MatrixShape<4, 4>;
328
+
329
+ dim3 block(16, 8);
330
+ dim3 grid(
331
+ (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow),
332
+ (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn),
333
+ batch_count
334
+ );
335
+
336
+ // Launch a GEMM kernel
337
+ kernel::BatchedGemm<
338
+ TensorRefCollectionA,
339
+ TensorRefCollectionB,
340
+ TensorRefCollectionC,
341
+ ScalarType,
342
+ AccumulatorType,
343
+ OutputTile,
344
+ InnerProductOp,
345
+ ConvertOp
346
+ ><<< grid, block >>>(
347
+ problem_size,
348
+ alpha,
349
+ tensor_a,
350
+ tensor_b,
351
+ beta,
352
+ tensor_c,
353
+ initial_accum
354
+ );
355
+ }
356
+
357
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
358
+ /// objects.
359
+ //
360
+ // TensorRefCollection* is a type satisfying the TensorRefCollection concept.
361
+ //
362
+ template <
363
+ typename TensorRefCollectionA,
364
+ typename TensorRefCollectionB,
365
+ typename TensorRefCollectionC,
366
+ typename ScalarType,
367
+ typename AccumulatorType
368
+ >
369
+ void BatchedGemm(
370
+ gemm::GemmCoord problem_size,
371
+ int batch_count,
372
+ ScalarType alpha,
373
+ TensorRefCollectionA const& tensor_a,
374
+ TensorRefCollectionB const& tensor_b,
375
+ ScalarType beta,
376
+ TensorRefCollectionC &tensor_c) {
377
+
378
+ BatchedGemm(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0));
379
+ }
380
+
381
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
382
+
383
+ } // namespace device
384
+ } // namespace reference
385
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_complex.h ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued GEMM in device-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/functional.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+
43
+ #include "cutlass/tensor_view.h"
44
+ #include "cutlass/gemm/gemm.h"
45
+
46
+ namespace cutlass {
47
+ namespace reference {
48
+ namespace device {
49
+
50
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ namespace kernel {
53
+
54
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
55
+ /// objects.
56
+ ///
57
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
58
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
59
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
60
+ /// arguments explicitly.
61
+ template <
62
+ typename ElementA,
63
+ typename LayoutA,
64
+ typename ElementB,
65
+ typename LayoutB,
66
+ typename ElementC,
67
+ typename LayoutC,
68
+ typename ScalarType,
69
+ typename ComputeType,
70
+ typename ElementD = ElementC,
71
+ typename ConvertOp = NumericConverter<ElementD, ScalarType>,
72
+ typename InnerProductOp = multiply_add<ComputeType>,
73
+ int kMblock = 4,
74
+ int kNblock = 4
75
+ >
76
+ __global__ void GemmComplex(
77
+ gemm::GemmCoord problem_size,
78
+ ScalarType alpha,
79
+ TensorRef<ElementA, LayoutA> tensor_a,
80
+ ComplexTransform transform_a,
81
+ TensorRef<ElementB, LayoutB> tensor_b,
82
+ ComplexTransform transform_b,
83
+ ScalarType beta,
84
+ TensorRef<ElementC, LayoutC> tensor_c,
85
+ TensorRef<ElementD, LayoutC> tensor_d,
86
+ ComputeType initial_accum,
87
+ int batch_count = 1,
88
+ int64_t batch_stride_A = 0,
89
+ int64_t batch_stride_B = 0,
90
+ int64_t batch_stride_C = 0,
91
+ int64_t batch_stride_D = 0) {
92
+
93
+ static_assert(
94
+ LayoutA::kRank == 2 &&
95
+ LayoutB::kRank == 2 &&
96
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
97
+
98
+ int const M = problem_size.m();
99
+ int const N = problem_size.n();
100
+ int const K = problem_size.k();
101
+
102
+ ConvertOp convert_op;
103
+ InnerProductOp inner_product_op;
104
+
105
+ int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
106
+ int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock;
107
+ int batch_idx = blockIdx.z;
108
+
109
+ tensor_a.add_pointer_offset(batch_idx * batch_stride_A);
110
+ tensor_b.add_pointer_offset(batch_idx * batch_stride_B);
111
+ tensor_c.add_pointer_offset(batch_idx * batch_stride_C);
112
+ tensor_d.add_pointer_offset(batch_idx * batch_stride_D);
113
+
114
+ for (; batch_idx < batch_count; batch_idx += gridDim.z) {
115
+
116
+ // Compute matrix product using blocks
117
+ ComputeType accum[kMblock][kNblock];
118
+
119
+ CUTLASS_PRAGMA_UNROLL
120
+ for (int j = 0; j < kNblock; j++) {
121
+ CUTLASS_PRAGMA_UNROLL
122
+ for (int i = 0; i < kMblock; i++) {
123
+ accum[i][j] = initial_accum;
124
+ }
125
+ }
126
+
127
+ for (int k_block = 0; k_block < K; ++k_block) {
128
+ CUTLASS_PRAGMA_UNROLL
129
+ for (int j = 0; j < kNblock; j++) {
130
+ CUTLASS_PRAGMA_UNROLL
131
+ for (int i = 0; i < kMblock; i++) {
132
+ int row = row_block + i;
133
+ int col = col_block + j;
134
+
135
+ if (row < M && col < N) {
136
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
137
+ ElementB b = tensor_b.at(MatrixCoord(k_block, col));
138
+
139
+ ComputeType a_ik = ComputeType(a);
140
+ ComputeType b_kj = ComputeType(b);
141
+
142
+ if (transform_a == ComplexTransform::kConjugate) {
143
+ a_ik = conj(a_ik);
144
+ }
145
+
146
+ if (transform_b == ComplexTransform::kConjugate) {
147
+ b_kj = conj(b_kj);
148
+ }
149
+
150
+ accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]);
151
+ }
152
+ }
153
+ }
154
+ }
155
+
156
+ CUTLASS_PRAGMA_UNROLL
157
+ for (int j = 0; j < kNblock; j++) {
158
+ CUTLASS_PRAGMA_UNROLL
159
+ for (int i = 0; i < kMblock; i++) {
160
+ int row = row_block + i;
161
+ int col = col_block + j;
162
+
163
+ MatrixCoord coord = MatrixCoord(row, col);
164
+
165
+ if (row < M && col < N) {
166
+
167
+ tensor_d.at(coord) = convert_op(
168
+ alpha * ScalarType(accum[i][j]) +
169
+ beta * ScalarType(tensor_c.at(coord)));
170
+ }
171
+ }
172
+ }
173
+
174
+ tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
175
+ tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
176
+ tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
177
+ tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
178
+
179
+ } // for (batch_idx)
180
+ }
181
+
182
+ } // namespace kernel
183
+
184
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
185
+
186
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
187
+ /// objects.
188
+ ///
189
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
190
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
191
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
192
+ /// arguments explicitly.
193
+ template <
194
+ typename ElementA,
195
+ typename LayoutA,
196
+ typename ElementB,
197
+ typename LayoutB,
198
+ typename ElementC,
199
+ typename LayoutC,
200
+ typename ScalarType,
201
+ typename ComputeType,
202
+ typename ElementD = ElementC,
203
+ typename ConvertOp = NumericConverter<ElementD, ScalarType>,
204
+ typename InnerProductOp = multiply_add<ComputeType>
205
+ >
206
+ void GemmComplex(
207
+ gemm::GemmCoord problem_size,
208
+ ScalarType alpha,
209
+ TensorRef<ElementA, LayoutA> tensor_a,
210
+ ComplexTransform transform_a,
211
+ TensorRef<ElementB, LayoutB> tensor_b,
212
+ ComplexTransform transform_b,
213
+ ScalarType beta,
214
+ TensorRef<ElementC, LayoutC> tensor_c,
215
+ TensorRef<ElementD, LayoutC> tensor_d,
216
+ ComputeType initial_accum,
217
+ int batch_count = 1,
218
+ int64_t batch_stride_A = 0,
219
+ int64_t batch_stride_B = 0,
220
+ int64_t batch_stride_C = 0,
221
+ int64_t batch_stride_D = 0) {
222
+
223
+ static_assert(
224
+ LayoutA::kRank == 2 &&
225
+ LayoutB::kRank == 2 &&
226
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
227
+
228
+ int const kMblock = 4;
229
+ int const kNblock = 4;
230
+
231
+ dim3 block(16, 8);
232
+ dim3 grid(
233
+ (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
234
+ (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock),
235
+ batch_count % std::numeric_limits<uint16_t>::max()
236
+ );
237
+
238
+ if (grid.y <= std::numeric_limits<uint16_t>::max()) {
239
+ kernel::GemmComplex<
240
+ ElementA,
241
+ LayoutA,
242
+ ElementB,
243
+ LayoutB,
244
+ ElementC,
245
+ LayoutC,
246
+ ScalarType,
247
+ ComputeType,
248
+ ElementD,
249
+ ConvertOp,
250
+ InnerProductOp,
251
+ kMblock,
252
+ kNblock
253
+ ><<< grid, block >>>(
254
+ problem_size,
255
+ alpha,
256
+ tensor_a,
257
+ transform_a,
258
+ tensor_b,
259
+ transform_b,
260
+ beta,
261
+ tensor_c,
262
+ tensor_d,
263
+ initial_accum,
264
+ batch_count,
265
+ batch_stride_A,
266
+ batch_stride_B,
267
+ batch_stride_C,
268
+ batch_stride_D
269
+ );
270
+ } else {
271
+ // Using bigger thread tile size
272
+ int const kBigMblock = 4;
273
+ int const kBigNblock = 16;
274
+
275
+ dim3 Bigblock(16, 8);
276
+ dim3 Biggrid(
277
+ (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock),
278
+ (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock),
279
+ batch_count % std::numeric_limits<uint16_t>::max()
280
+ );
281
+
282
+ kernel::GemmComplex<
283
+ ElementA,
284
+ LayoutA,
285
+ ElementB,
286
+ LayoutB,
287
+ ElementC,
288
+ LayoutC,
289
+ ScalarType,
290
+ ComputeType,
291
+ ElementD,
292
+ ConvertOp,
293
+ InnerProductOp,
294
+ kBigMblock,
295
+ kBigNblock
296
+ ><<< Biggrid, Bigblock >>>(
297
+ problem_size,
298
+ alpha,
299
+ tensor_a,
300
+ transform_a,
301
+ tensor_b,
302
+ transform_b,
303
+ beta,
304
+ tensor_c,
305
+ tensor_d,
306
+ initial_accum,
307
+ batch_count,
308
+ batch_stride_A,
309
+ batch_stride_B,
310
+ batch_stride_C,
311
+ batch_stride_D
312
+ );
313
+ }
314
+ }
315
+
316
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
317
+
318
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
319
+ /// objects.
320
+ ///
321
+ /// This assumes the accumulator type is the same type as the scalars.
322
+ template <
323
+ typename ElementA,
324
+ typename LayoutA,
325
+ typename ElementB,
326
+ typename LayoutB,
327
+ typename ElementC,
328
+ typename LayoutC,
329
+ typename ScalarType,
330
+ typename ElementD = ElementC
331
+ >
332
+ void GemmComplex(
333
+ gemm::GemmCoord problem_size,
334
+ ScalarType alpha,
335
+ TensorRef<ElementA, LayoutA> tensor_a,
336
+ ComplexTransform transform_a,
337
+ TensorRef<ElementB, LayoutB> tensor_b,
338
+ ComplexTransform transform_b,
339
+ ScalarType beta,
340
+ TensorRef<ElementC, LayoutC> tensor_c,
341
+ TensorRef<ElementD, LayoutC> tensor_d) {
342
+
343
+ GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0));
344
+ }
345
+
346
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
347
+
348
+ } // namespace device
349
+ } // namespace reference
350
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued GEMM in device code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/matrix_coord.h"
40
+ #include "cutlass/numeric_types.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/numeric_conversion.h"
43
+ #include "cutlass/tensor_ref_planar_complex.h"
44
+
45
+ #include "cutlass/tensor_view.h"
46
+ #include "cutlass/gemm/gemm.h"
47
+
48
+ namespace cutlass {
49
+ namespace reference {
50
+ namespace device {
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ namespace kernel {
55
+
56
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
+ static int const kGemmPlanarComplexBlockSize = 4;
59
+
60
/// Reference device kernel for planar-complex GEMM: computes
/// D = alpha * op(A) * op(B) + beta * C over an M-by-N output, where op()
/// optionally conjugates an operand. The batch dimension of problem_size is ignored.
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ScalarType,
  typename ComputeType,
  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
  typename InnerProductOp = multiply_add<complex<ComputeType>>
>
__global__ void GemmPlanarComplex(
  gemm::GemmCoord problem_size,
  complex<ScalarType> alpha,
  TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
  ComplexTransform transform_a,
  TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
  ComplexTransform transform_b,
  complex<ScalarType> beta,
  TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
  TensorRefPlanarComplex<ElementC, LayoutC> tensor_d,
  complex<ComputeType> initial_accum) {

  // Each thread computes a kMblock x kNblock sub-tile of the output.
  int const kMblock = kGemmPlanarComplexBlockSize;
  int const kNblock = kGemmPlanarComplexBlockSize;

  using ComplexA = typename TensorRefPlanarComplex<ElementA, LayoutA>::ComplexElement;
  using ComplexB = typename TensorRefPlanarComplex<ElementB, LayoutB>::ComplexElement;
  using ComplexC = typename TensorRefPlanarComplex<ElementC, LayoutC>::ComplexElement;

  // Note: batch is ignored.
  int const M = problem_size.m();
  int const N = problem_size.n();
  int const K = problem_size.k();

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Per-thread accumulators for the sub-tile, seeded with the caller-provided value.
  complex<ComputeType> accum[kMblock][kNblock];

  // Top-left output coordinate of this thread's sub-tile.
  int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
  int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock;

  CUTLASS_PRAGMA_UNROLL
  for (int j = 0; j < kNblock; j++) {
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kMblock; i++) {
      accum[i][j] = initial_accum;
    }
  }

  // March over the K dimension, accumulating one outer-product contribution per step.
  CUTLASS_PRAGMA_NO_UNROLL
  for (int k_block = 0; k_block < K; ++k_block) {

    CUTLASS_PRAGMA_UNROLL
    for (int j = 0; j < kNblock; j++) {

      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < kMblock; i++) {

        int row = row_block + i;
        int col = col_block + j;

        // Guard against partial tiles at the right/bottom edges of the problem.
        if (row < M && col < N) {

          ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block));
          ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col));

          // Widen both operands to the compute type before multiplying.
          complex<ComputeType> a = complex<ComputeType>{
            ComputeType(a_ik.real()),
            ComputeType(a_ik.imag())
          };

          complex<ComputeType> b = complex<ComputeType>{
            ComputeType(b_kj.real()),
            ComputeType(b_kj.imag())
          };

          // Apply the requested per-operand complex transform (conjugation).
          if (transform_a == ComplexTransform::kConjugate) {
            a = conj(a);
          }

          if (transform_b == ComplexTransform::kConjugate) {
            b = conj(b);
          }

          accum[i][j] = inner_product_op(a, b, accum[i][j]);
        }
      }
    }
  }

  // Epilogue: scale by alpha/beta, convert to the output type, and store the sub-tile.
  CUTLASS_PRAGMA_UNROLL
  for (int j = 0; j < kNblock; j++) {
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < kMblock; i++) {

      int row = row_block + i;
      int col = col_block + j;

      MatrixCoord coord = MatrixCoord(row, col);

      if (row < M && col < N) {

        complex<ScalarType> acc{
          ScalarType(accum[i][j].real()),
          ScalarType(accum[i][j].imag())
        };

        ComplexC c_ij = ComplexC();

        // C is read only when beta is nonzero, so it is never dereferenced for beta == 0.
        if (beta.real() != ScalarType() || beta.imag() != ScalarType()) {
          c_ij = tensor_c.at(coord);
        }

        complex<ScalarType> src{
          ScalarType(c_ij.real()),
          ScalarType(c_ij.imag())
        };

        complex<ScalarType> result = alpha * acc + beta * src;

        ComplexC d_ij;

        // Real and imaginary parts are converted independently to the output element type.
        d_ij.real() = convert_op(result.real());
        d_ij.imag() = convert_op(result.imag());

        tensor_d.at(coord) = d_ij;
      }
    }
  }
}
193
+
194
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
195
+
196
+ } // namespace kernel
197
+
198
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
199
+
200
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
201
+ /// objects.
202
+ ///
203
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
204
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
205
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
206
+ /// arguments explicitly.
207
+ template <
208
+ typename ElementA,
209
+ typename LayoutA,
210
+ typename ElementB,
211
+ typename LayoutB,
212
+ typename ElementC,
213
+ typename LayoutC,
214
+ typename ScalarType,
215
+ typename ComputeType,
216
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>,
217
+ typename InnerProductOp = multiply_add<complex<ComputeType>>
218
+ >
219
+ void GemmPlanarComplex(
220
+ gemm::GemmCoord problem_size,
221
+ complex<ScalarType> alpha,
222
+ TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
223
+ ComplexTransform transform_a,
224
+ TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
225
+ ComplexTransform transform_b,
226
+ complex<ScalarType> beta,
227
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
228
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_d,
229
+ complex<ComputeType> initial_accum) {
230
+
231
+ static_assert(
232
+ LayoutA::kRank == 2 &&
233
+ LayoutB::kRank == 2 &&
234
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
235
+
236
+ int const kMblock = kernel::kGemmPlanarComplexBlockSize;
237
+ int const kNblock = kernel::kGemmPlanarComplexBlockSize;
238
+
239
+ dim3 block(16, 8);
240
+
241
+ dim3 grid(
242
+ (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
243
+ (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock),
244
+ 1);
245
+
246
+ kernel::GemmPlanarComplex<
247
+ ElementA, LayoutA,
248
+ ElementB, LayoutB,
249
+ ElementC, LayoutC,
250
+ ScalarType,
251
+ ComputeType,
252
+ ConvertOp,
253
+ InnerProductOp
254
+ ><<< grid, block >>>(
255
+ problem_size,
256
+ alpha,
257
+ tensor_a,
258
+ transform_a,
259
+ tensor_b,
260
+ transform_b,
261
+ beta,
262
+ tensor_c,
263
+ tensor_d,
264
+ initial_accum
265
+ );
266
+ }
267
+
268
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
269
+
270
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
271
+ /// objects.
272
+ ///
273
+ /// This assumes the accumulator type is the same type as the scalars.
274
+ template <
275
+ typename ElementA,
276
+ typename LayoutA,
277
+ typename ElementB,
278
+ typename LayoutB,
279
+ typename ElementC,
280
+ typename LayoutC,
281
+ typename ScalarType
282
+ >
283
+ void GemmPlanarComplex(
284
+ gemm::GemmCoord problem_size,
285
+ complex<ScalarType> alpha,
286
+ TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
287
+ ComplexTransform transform_a,
288
+ TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
289
+ ComplexTransform transform_b,
290
+ complex<ScalarType> beta,
291
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
292
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_d) {
293
+
294
+ GemmPlanarComplex(
295
+ problem_size,
296
+ alpha,
297
+ tensor_a, transform_a,
298
+ tensor_b, transform_b,
299
+ beta,
300
+ tensor_c,
301
+ tensor_d,
302
+ complex<ScalarType>());
303
+ }
304
+
305
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
306
+
307
+ } // namespace device
308
+ } // namespace reference
309
+ } // namespace cutlass
310
+
311
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gett.hpp ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief GETT device reference code
33
+ */
34
+ #pragma once
35
+
36
+ #include <cute/tensor.hpp>
37
+
38
+ namespace cutlass::reference::device {
39
+
40
+ template <
41
+ class ATensor,
42
+ class BTensor,
43
+ class CTensor,
44
+ class DTensor,
45
+ class ElementAccumulator,
46
+ class ElementEpilogue>
47
+ __global__ static
48
+ void
49
+ gett_kernel(
50
+ DTensor D,
51
+ ATensor const A,
52
+ BTensor const B,
53
+ CTensor const C,
54
+ ElementEpilogue alpha, ElementEpilogue beta,
55
+ ElementAccumulator acc_init)
56
+ {
57
+ using namespace cute;
58
+
59
+ static_assert(DTensor::rank == 3, "(M,N,L)");
60
+ static_assert(ATensor::rank == 3, "(M,K,L)");
61
+ static_assert(BTensor::rank == 3, "(N,K,L)");
62
+ static_assert(CTensor::rank == 3, "(M,N,L)");
63
+
64
+ assert(size<0>(A) == size<0>(D)); // M
65
+ assert(size<0>(C) == size<0>(D)); // M
66
+ assert(size<0>(B) == size<1>(D)); // N
67
+ assert(size<1>(C) == size<1>(D)); // N
68
+ assert(size<1>(A) == size<1>(B)); // K
69
+ assert(size<2>(A) == size<2>(D)); // L
70
+ assert(size<2>(B) == size<2>(D)); // L
71
+ assert(size<2>(C) == size<2>(D)); // L
72
+
73
+ NumericConverter<ElementAccumulator, typename ATensor::value_type> a_converter;
74
+ NumericConverter<ElementAccumulator, typename BTensor::value_type> b_converter;
75
+ NumericConverter<ElementEpilogue, ElementAccumulator> acc_converter;
76
+ NumericConverter<ElementEpilogue, typename CTensor::value_type> source_converter;
77
+ NumericConverter<typename DTensor::value_type, ElementEpilogue> output_converter;
78
+
79
+ // Thread id to each element of D
80
+ for (int tid = threadIdx.x + blockDim.x * blockIdx.x;
81
+ tid < size(D);
82
+ tid += blockDim.x * gridDim.x) {
83
+ // (m,n,l) coordinate
84
+ auto mnl_coord = idx2crd(tid, product_each(shape(D)));
85
+ auto m = get<0>(mnl_coord);
86
+ auto n = get<1>(mnl_coord);
87
+ auto l = get<2>(mnl_coord);
88
+
89
+ auto A_ml = A(m,_,l);
90
+ auto B_nl = B(n,_,l);
91
+
92
+ ElementAccumulator accum = ElementAccumulator(0);
93
+ for (int k = 0; k < size<1>(A); ++k) {
94
+ ElementAccumulator a = a_converter(A_ml(k));
95
+ ElementAccumulator b = b_converter(B_nl(k));
96
+ accum += a * b;
97
+ }
98
+
99
+ ElementEpilogue scaled_output = (alpha * acc_converter(accum)) + (beta * source_converter(C(m,n,l)));
100
+ D(m,n,l) = output_converter(scaled_output);
101
+ }
102
+ }
103
+
104
+ // Most general version
105
/// Host-side entry point for the GETT device reference: wraps the raw pointers and strides
/// into CuTe tensors and launches gett_kernel on 'stream'.
///
/// The unnamed ElementAccumulator parameter '_' communicates only the accumulator type;
/// its value is ignored and the kernel is always launched with ElementAccumulator(0).
template <
  class ProblemShapeMNKL,
  class ElementA,
  class StrideA,
  class ElementB,
  class StrideB,
  class ElementAccumulator,
  class ElementC,
  class StrideC,
  class ElementD,
  class StrideD,
  class ElementEpilogue>
void
gett(
    ProblemShapeMNKL problem_shape_mnkl,
    ElementA const* ptr_A, StrideA stride_a_mkl,
    ElementB const* ptr_B, StrideB stride_b_nkl,
    ElementAccumulator _,
    ElementC const* ptr_C, StrideC stride_c_mnl,
    ElementD * ptr_D, StrideD stride_d_mnl,
    ElementEpilogue alpha, ElementEpilogue beta,
    cudaStream_t stream = 0) {
  using namespace cute;

  // Problem shape is rank-4: (M, N, K, L), where L is the batch extent.
  static_assert(cute::rank(ProblemShapeMNKL{}) == 4);
  auto M = get<0>(problem_shape_mnkl);
  auto N = get<1>(problem_shape_mnkl);
  auto K = get<2>(problem_shape_mnkl);
  auto L = get<3>(problem_shape_mnkl);

  // Represent the full tensors over global-memory pointers.
  auto A = make_tensor(make_gmem_ptr(ptr_A), make_shape(M,K,L), stride_a_mkl); // (M,K,L)
  auto B = make_tensor(make_gmem_ptr(ptr_B), make_shape(N,K,L), stride_b_nkl); // (N,K,L)
  auto C = make_tensor(make_gmem_ptr(ptr_C), make_shape(M,N,L), stride_c_mnl); // (M,N,L)
  auto D = make_tensor(make_gmem_ptr(ptr_D), make_shape(M,N,L), stride_d_mnl); // (M,N,L)

  // Fixed launch shape; the kernel's grid-stride loop covers all of D regardless of size.
  dim3 dimBlock(256);
  dim3 dimGrid(240);
  gett_kernel<<< dimGrid, dimBlock, 0, stream >>>(D, A, B, C, alpha, beta, ElementAccumulator(0));
}
145
+
146
+ } // namespace cutlass::reference::device
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/gemm.h ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GEMM in host-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/tensor_view.h"
39
+ #include "cutlass/gemm/gemm.h"
40
+
41
+ #include "cutlass/util/reference/device/thread/gemm.h"
42
+
43
+ namespace cutlass {
44
+ namespace reference {
45
+ namespace device {
46
+ namespace kernel {
47
+
48
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
51
+ /// objects.
52
+ template <
53
+ typename TensorRefA,
54
+ typename TensorRefB,
55
+ typename TensorRefC,
56
+ typename ScalarType,
57
+ typename AccumulatorType,
58
+ typename OutputTile,
59
+ typename InnerProductOp,
60
+ typename ConvertOp
61
+ >
62
+ __global__ void Gemm(
63
+ gemm::GemmCoord problem_size,
64
+ ScalarType alpha,
65
+ TensorRefA tensor_a,
66
+ TensorRefB tensor_b,
67
+ ScalarType beta,
68
+ TensorRefC tensor_c,
69
+ TensorRefC tensor_d,
70
+ AccumulatorType initial_accum) {
71
+
72
+ // Map each thread to a unique tile of the output matrix
73
+ MatrixCoord output_coord(
74
+ MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow),
75
+ MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn)
76
+ );
77
+
78
+ // Compute the general matrix product
79
+ thread::Gemm<
80
+ TensorRefA,
81
+ TensorRefB,
82
+ TensorRefC,
83
+ ScalarType,
84
+ AccumulatorType,
85
+ OutputTile,
86
+ InnerProductOp,
87
+ ConvertOp
88
+ > gemm(initial_accum);
89
+
90
+ gemm.multiply_add(
91
+ problem_size,
92
+ tensor_a,
93
+ tensor_b,
94
+ output_coord);
95
+
96
+ gemm.epilogue(problem_size, alpha, beta, tensor_c, tensor_d, output_coord);
97
+ }
98
+
99
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
100
+
101
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
102
+ /// objects.
103
/// Device kernel computing a batch of general matrix products. blockIdx.z selects the
/// batch element from each TensorRef collection; each thread then computes one
/// OutputTile of that batch's output matrix.
template <
  typename TensorRefCollectionA,
  typename TensorRefCollectionB,
  typename TensorRefCollectionC,
  typename ScalarType,
  typename AccumulatorType,
  typename OutputTile,
  typename InnerProductOp,
  typename ConvertOp
>
__global__ void BatchedGemm(
  gemm::GemmCoord problem_size,
  ScalarType alpha,
  TensorRefCollectionA tensor_collection_a,
  TensorRefCollectionB tensor_collection_b,
  ScalarType beta,
  TensorRefCollectionC tensor_collection_c,
  AccumulatorType initial_accum) {

  // Obtain batch ID
  int batch_id = blockIdx.z;

  // Dereference based on batch_id
  typename TensorRefCollectionA::TensorRef tensor_a = tensor_collection_a.at(batch_id);
  typename TensorRefCollectionB::TensorRef tensor_b = tensor_collection_b.at(batch_id);
  typename TensorRefCollectionC::TensorRef tensor_c = tensor_collection_c.at(batch_id);

  // Map each thread to a unique tile of the output matrix.
  // NOTE(review): the x-derived index is scaled by OutputTile::kColumn and the y-derived
  // index by OutputTile::kRow — transposed relative to Gemm above, which uses kRow on x.
  // Confirm this matches the coordinate order thread::Gemm expects.
  MatrixCoord output_coord(
    (threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kColumn,
    (threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kRow
  );

  // Compute the general matrix product
  thread::Gemm<
    typename TensorRefCollectionA::TensorRef,
    typename TensorRefCollectionB::TensorRef,
    typename TensorRefCollectionC::TensorRef,
    ScalarType,
    AccumulatorType,
    OutputTile,
    InnerProductOp,
    ConvertOp
  > gemm(initial_accum);

  gemm.multiply_add(
    problem_size,
    tensor_a,
    tensor_b,
    output_coord);

  // Epilogue overload without a separate D tensor — presumably the scaled result is
  // written back through tensor_c; verify against thread::Gemm::epilogue.
  gemm.epilogue(problem_size, alpha, beta, tensor_c, output_coord);
}
156
+
157
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
158
+
159
+ } // namespace kernel
160
+ } // namespace device
161
+ } // namespace reference
162
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include <curand_kernel.h>
35
+
36
+ #include "cutlass/cutlass.h"
37
+
38
+ namespace cutlass {
39
+ namespace reference {
40
+ namespace device {
41
+ namespace kernel {
42
+
43
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ /// Kernel to initialize tensor to uniform random distribution
46
/// Kernel to initialize tensor to uniform random distribution.
///
/// Each thread owns one contiguous-dimension index (c_idx) and walks a run of blockDim.x
/// strided-dimension indices starting at s_idx, writing one element per loop step.
template <typename T>
__global__ void TensorInitializeUniform(
    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
  // One cuRAND state per thread of the block; assumes blockDim.x <= 1024 — TODO confirm
  // against launch configuration.
  __shared__ curandState_t rng_state[1024];

  // Globally unique thread id used as the cuRAND subsequence for reproducibility.
  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  // Position the pointer at this thread's first element.
  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      // Draw a uniform sample and map it onto [min, max].
      double range = dist.uniform.max - dist.uniform.min;

      double rnd = curand_uniform(&rng_state[threadIdx.x]);

      rnd = dist.uniform.min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      // Advance to the next strided element for this thread.
      tensor += ldm;
    }
  }
}
81
+
82
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
83
+
84
+ /// Kernel to initialize tensor to uniform distribution
85
+ template <typename T>
86
+ __global__ void TensorInitializeGaussian(
87
+ Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
88
+ __shared__ curandState_t rng_state[1024];
89
+
90
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
91
+
92
+ curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
93
+
94
+ int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
95
+ int s_idx = blockIdx.y * blockDim.x;
96
+
97
+ tensor += s_idx * ldm + c_idx;
98
+
99
+ for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
100
+ if (s_idx < dim_strided && c_idx < dim_contiguous) {
101
+ // Random values are cast to integer after scaling by a power of two to facilitate error
102
+ // testing
103
+
104
+ double rnd = curand_normal(&rng_state[threadIdx.x]);
105
+
106
+ rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;
107
+
108
+ if (dist.int_scale >= 0) {
109
+ rnd = double(int(rnd * double(1 << dist.int_scale)));
110
+ *tensor = T(rnd / double(1 << dist.int_scale));
111
+ } else {
112
+ *tensor = T(rnd);
113
+ }
114
+ }
115
+ }
116
+ }
117
+
118
+ /// Kernel to initialize tensor to an identity matrix
119
+ template <typename T>
120
+ __global__ void TensorInitializeLinear(
121
+ Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
122
+ __shared__ curandState_t rng_state[1024];
123
+
124
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
125
+
126
+ curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
127
+
128
+ int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
129
+ int s_idx = blockIdx.y * blockDim.x;
130
+
131
+ tensor += s_idx * ldm + c_idx;
132
+
133
+ for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
134
+ if (s_idx < dim_strided && c_idx < dim_contiguous) {
135
+ *tensor =
136
+ dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;
137
+ }
138
+ }
139
+ }
140
+
141
+ /// Kernel to initialize tensor to an identity matrix
142
+ template <typename T>
143
+ __global__ void TensorInitializeIdentity(
144
+ Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
145
+ __shared__ curandState_t rng_state[1024];
146
+
147
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
148
+
149
+ curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
150
+
151
+ int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
152
+ int s_idx = blockIdx.y * blockDim.x;
153
+
154
+ tensor += s_idx * ldm + c_idx;
155
+
156
+ for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
157
+ if (s_idx < dim_strided && c_idx < dim_contiguous) {
158
+ *tensor = (c_idx == s_idx ? T(1) : T(0));
159
+ }
160
+ }
161
+ }
162
+
163
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
164
+
165
+ } // namespace kernel
166
+ } // namespace device
167
+ } // namespace reference
168
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include "cutlass/cutlass.h"
35
+ #include "cutlass/coord.h"
36
+ #include "cutlass/subbyte_reference.h"
37
+ #include "cutlass/fast_math.h"
38
+
39
+ namespace cutlass {
40
+ namespace reference {
41
+ namespace device {
42
+ namespace kernel {
43
+
44
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ /// Defines several helpers
47
+ namespace detail {
48
+
49
/// Helper to perform for-each operation.
/// Decomposes a linear index into a Coord<Rank> one rank at a time via
/// compile-time recursion on RankRemaining (Rank-1 down to 0).
template <typename Func, int Rank, int RankRemaining>
struct TensorForEachHelper {

  /// Constructor for general rank.
  /// Peels off coordinate (Rank - 1 - RankRemaining): `product` is the number
  /// of elements spanned by the RankRemaining faster-varying ranks, so
  /// index / product is this rank's coordinate and index % product is what
  /// remains for the faster ranks.
  __inline__ __device__
  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {

    int64_t product = 1;

    CUTLASS_PRAGMA_UNROLL
    for (int i = Rank - RankRemaining; i < Rank; ++i) {
      product *= size[i];
    }

    coord[Rank - 1 - RankRemaining] = index / product;
    int64_t remaining = index % product;

    // Recurse; the RankRemaining == 0 specialization invokes the functor.
    TensorForEachHelper<Func, Rank, RankRemaining-1>(func, size, coord, remaining);
  }
};
70
+
71
/// Helper to perform for-each operation — terminal specialization.
template <typename Func, int Rank>
struct TensorForEachHelper<Func, Rank, 0> {

  /// Constructor for fastest changing rank.
  /// Assigns the last coordinate and invokes the functor only if the full
  /// coordinate lies inside the extent (guards tail indices of the linear
  /// range; Coord's operator< performs the bounds comparison).
  __inline__ __device__
  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {

    coord[Rank - 1] = index;

    if (coord < size) {
      func(coord);
    }
  }
};
86
+
87
+ } // namespace detail
88
+
89
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
90
+
91
+ /// Kernel calls a functor for each element in a tensor's index space
92
+ template <typename Func, int Rank, typename Params>
93
+ __global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
94
+
95
+ Func func(params);
96
+
97
+ int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
98
+ int64_t max_index = 1;
99
+
100
+ CUTLASS_PRAGMA_UNROLL
101
+ for (int i = 0; i < Rank; ++i) {
102
+ max_index *= size[i];
103
+ }
104
+
105
+ CUTLASS_PRAGMA_NO_UNROLL
106
+ while (index < max_index) {
107
+ Coord<Rank> coord;
108
+
109
+ detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index);
110
+ index += blockDim.x * gridDim.x;
111
+ }
112
+ }
113
+
114
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
115
+
116
+ /// Kernel calls a functor for each element along a tensor's diagonal
117
+ template <typename Func, int Rank, typename Params>
118
+ __global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start, int end) {
119
+
120
+ Func func(params);
121
+
122
+ int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start;
123
+
124
+ if (index < end) {
125
+ Coord<Rank> coord;
126
+
127
+ CUTLASS_PRAGMA_UNROLL
128
+ for (int i = 0; i < Rank; ++i) {
129
+ coord[i] = index;
130
+ }
131
+
132
+ func(coord);
133
+ }
134
+ }
135
+
136
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
137
+
138
+ template <typename Element, typename Func>
139
+ __global__ void BlockForEach(
140
+ Element *ptr,
141
+ size_t capacity,
142
+ typename Func::Params params) {
143
+
144
+ Func func(params);
145
+
146
+ size_t index = threadIdx.x + blockIdx.x * blockDim.x;
147
+
148
+ for (; index < capacity; index += blockDim.x * gridDim.x) {
149
+ ReferenceFactory<Element>::get(ptr, index) = func();
150
+ }
151
+ }
152
+
153
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
154
+
155
+ } // namespace kernel
156
+ } // namespace device
157
+ } // namespace reference
158
+ } // namespace cutlass
159
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued GEMM in device-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/blas3.h"
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/numeric_conversion.h"
40
+ #include "cutlass/tensor_view.h"
41
+ #include "cutlass/gemm/gemm.h"
42
+
43
+ namespace cutlass {
44
+ namespace reference {
45
+ namespace device {
46
+
47
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ namespace kernel {
50
+
51
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
52
+ /// objects.
53
+ ///
54
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
55
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
56
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
57
+ /// arguments explicitly.
58
+ template <
59
+ typename ElementA,
60
+ typename LayoutA,
61
+ typename ElementB,
62
+ typename LayoutB,
63
+ typename ElementC,
64
+ typename LayoutC,
65
+ typename ScalarType,
66
+ typename ComputeType,
67
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>,
68
+ typename InnerProductOp = multiply_add<ComputeType>,
69
+ int kMblock = 4,
70
+ int kNblock = 4
71
+ >
72
+ __global__ void Rank2KComplex(
73
+ gemm::GemmCoord problem_size,
74
+ ScalarType alpha,
75
+ TensorRef<ElementA, LayoutA> tensor_a,
76
+ ComplexTransform transform_a,
77
+ TensorRef<ElementB, LayoutB> tensor_b,
78
+ ComplexTransform transform_b,
79
+ ScalarType beta,
80
+ TensorRef<ElementC, LayoutC> tensor_c,
81
+ TensorRef<ElementC, LayoutC> tensor_d,
82
+ ComputeType initial_accum,
83
+ FillMode fill_mode_c,
84
+ BlasMode blas_mode,
85
+ int batch_count = 1,
86
+ int64_t batch_stride_A = 0,
87
+ int64_t batch_stride_B = 0,
88
+ int64_t batch_stride_C = 0,
89
+ int64_t batch_stride_D = 0) {
90
+
91
+ static_assert(
92
+ LayoutA::kRank == 2 &&
93
+ LayoutB::kRank == 2 &&
94
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
95
+
96
+ int const M = problem_size.m();
97
+ int const N = problem_size.n();
98
+ int const K = problem_size.k();
99
+
100
+ assert(M=N);
101
+
102
+ ConvertOp convert_op;
103
+ InnerProductOp inner_product_op;
104
+
105
+ int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
106
+ int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock;
107
+ int batch_idx = blockIdx.z;
108
+
109
+ tensor_a.add_pointer_offset(batch_idx * batch_stride_A);
110
+ tensor_b.add_pointer_offset(batch_idx * batch_stride_B);
111
+ tensor_c.add_pointer_offset(batch_idx * batch_stride_C);
112
+ tensor_d.add_pointer_offset(batch_idx * batch_stride_D);
113
+
114
+ for (; batch_idx < batch_count; batch_idx += gridDim.z) {
115
+
116
+ // Compute matrix product using blocks
117
+ ComputeType accum[kMblock][kNblock];
118
+
119
+ CUTLASS_PRAGMA_UNROLL
120
+ for (int j = 0; j < kNblock; j++) {
121
+ CUTLASS_PRAGMA_UNROLL
122
+ for (int i = 0; i < kMblock; i++) {
123
+ accum[i][j] = initial_accum;
124
+ }
125
+ }
126
+
127
+ for (int k_block = 0; k_block < K; ++k_block) {
128
+ CUTLASS_PRAGMA_UNROLL
129
+ for (int j = 0; j < kNblock; j++) {
130
+ CUTLASS_PRAGMA_UNROLL
131
+ for (int i = 0; i < kMblock; i++) {
132
+ int row = row_block + i;
133
+ int col = col_block + j;
134
+
135
+ if (row < M && col < N &&
136
+ ( (fill_mode_c == FillMode::kLower && row >= col) ||
137
+ (fill_mode_c == FillMode::kUpper && row <= col) )
138
+ ) {
139
+
140
+ // A x B^T (Symmetric) or A x B^H (Hermitian)
141
+ // complex conjugation on operandB (b_t) is function of blas3 computation
142
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
143
+ ElementB b_t = (blas_mode == BlasMode::kHermitian) ?
144
+ conj(tensor_b.at(MatrixCoord(col, k_block))) :
145
+ tensor_b.at(MatrixCoord(col, k_block));
146
+
147
+ ComputeType a_ik = ComputeType(a);
148
+ ComputeType b_jk = ComputeType(b_t);
149
+
150
+ // complex conjugation is a function of operand layouts
151
+ if (transform_a == ComplexTransform::kConjugate) {
152
+ a_ik = conj(a_ik);
153
+ }
154
+ // complex conjugation is a function of operand layouts
155
+ if (transform_b == ComplexTransform::kConjugate) {
156
+ b_jk = conj(b_jk);
157
+ }
158
+
159
+ accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]);
160
+
161
+ // B x A^T (Symmetric) or B x A^H (Hermitian)
162
+ // complex conjugation on operandB (a_t) is function of blas3 computation
163
+ ElementB b = tensor_b.at(MatrixCoord(row, k_block));
164
+ ElementA a_t = (blas_mode == BlasMode::kHermitian) ?
165
+ conj(tensor_a.at(MatrixCoord(col, k_block))):
166
+ tensor_a.at(MatrixCoord(col, k_block));
167
+
168
+ ComputeType b_ik = ComputeType(b);
169
+ ComputeType a_jk = ComputeType(a_t);
170
+
171
+ // complex conjugation here is a function of operand layouts
172
+ if (transform_b == ComplexTransform::kConjugate) {
173
+ b_ik = conj(b_ik);
174
+ }
175
+ // complex conjugation here is a function of operand layouts
176
+ if (transform_a == ComplexTransform::kConjugate) {
177
+ a_jk = conj(a_jk);
178
+ }
179
+
180
+ accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]);
181
+ }
182
+ }
183
+ }
184
+ }
185
+
186
+ CUTLASS_PRAGMA_UNROLL
187
+ for (int j = 0; j < kNblock; j++) {
188
+ CUTLASS_PRAGMA_UNROLL
189
+ for (int i = 0; i < kMblock; i++) {
190
+ int row = row_block + i;
191
+ int col = col_block + j;
192
+
193
+ MatrixCoord coord = MatrixCoord(row, col);
194
+
195
+ if (row < M && col < N &&
196
+ ((fill_mode_c == FillMode::kLower && row >= col) ||
197
+ (fill_mode_c == FillMode::kUpper && row <= col))
198
+ ) {
199
+
200
+ ScalarType c = tensor_c.at(coord);
201
+ // The imaginary parts of the diagonal elements of
202
+ // a complex data type are assumed and set to zero
203
+ if (blas_mode == BlasMode::kHermitian) {
204
+ c = (row == col) ? real(c) : c;
205
+ }
206
+
207
+ tensor_d.at(coord) = convert_op(
208
+ alpha * ScalarType(accum[i][j]) +
209
+ beta * c);
210
+ }
211
+ }
212
+ }
213
+
214
+ tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
215
+ tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
216
+ tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
217
+ tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
218
+
219
+ } // for (batch_idx)
220
+ }
221
+
222
+ } // namespace kernel
223
+
224
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
225
+
226
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
227
+ /// objects.
228
+ ///
229
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
230
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
231
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
232
+ /// arguments explicitly.
233
+ template <
234
+ typename ElementA,
235
+ typename LayoutA,
236
+ typename ElementB,
237
+ typename LayoutB,
238
+ typename ElementC,
239
+ typename LayoutC,
240
+ typename ScalarType,
241
+ typename ComputeType,
242
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>,
243
+ typename InnerProductOp = multiply_add<ComputeType>
244
+ >
245
+ void Rank2KComplex(
246
+ gemm::GemmCoord problem_size,
247
+ ScalarType alpha,
248
+ TensorRef<ElementA, LayoutA> tensor_a,
249
+ ComplexTransform transform_a,
250
+ TensorRef<ElementB, LayoutB> tensor_b,
251
+ ComplexTransform transform_b,
252
+ ScalarType beta,
253
+ TensorRef<ElementC, LayoutC> tensor_c,
254
+ TensorRef<ElementC, LayoutC> tensor_d,
255
+ ComputeType initial_accum,
256
+ FillMode fill_mode_c,
257
+ BlasMode blas_mode,
258
+ int batch_count = 1,
259
+ int64_t batch_stride_A = 0,
260
+ int64_t batch_stride_B = 0,
261
+ int64_t batch_stride_C = 0,
262
+ int64_t batch_stride_D = 0) {
263
+
264
+ static_assert(
265
+ LayoutA::kRank == 2 &&
266
+ LayoutB::kRank == 2 &&
267
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
268
+
269
+ int const kMblock = 4;
270
+ int const kNblock = 4;
271
+
272
+ dim3 block(16, 8);
273
+ dim3 grid(
274
+ (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
275
+ (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock),
276
+ batch_count % std::numeric_limits<uint16_t>::max()
277
+ );
278
+
279
+ kernel::Rank2KComplex<
280
+ ElementA,
281
+ LayoutA,
282
+ ElementB,
283
+ LayoutB,
284
+ ElementC,
285
+ LayoutC,
286
+ ScalarType,
287
+ ComputeType,
288
+ ConvertOp,
289
+ InnerProductOp,
290
+ kMblock,
291
+ kNblock
292
+ ><<< grid, block >>>(
293
+ problem_size,
294
+ alpha,
295
+ tensor_a,
296
+ transform_a,
297
+ tensor_b,
298
+ transform_b,
299
+ beta,
300
+ tensor_c,
301
+ tensor_d,
302
+ initial_accum,
303
+ fill_mode_c,
304
+ blas_mode,
305
+ batch_count,
306
+ batch_stride_A,
307
+ batch_stride_B,
308
+ batch_stride_C,
309
+ batch_stride_D
310
+ );
311
+ }
312
+
313
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
314
+
315
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
316
+ /// objects.
317
+ ///
318
+ /// This assumes the accumulator type is the same type as the scalars.
319
+ template <
320
+ typename ElementA,
321
+ typename LayoutA,
322
+ typename ElementB,
323
+ typename LayoutB,
324
+ typename ElementC,
325
+ typename LayoutC,
326
+ typename ScalarType
327
+ >
328
+ void Rank2KComplex(
329
+ gemm::GemmCoord problem_size,
330
+ ScalarType alpha,
331
+ TensorRef<ElementA, LayoutA> tensor_a,
332
+ ComplexTransform transform_a,
333
+ TensorRef<ElementB, LayoutB> tensor_b,
334
+ ComplexTransform transform_b,
335
+ ScalarType beta,
336
+ TensorRef<ElementC, LayoutC> tensor_c,
337
+ TensorRef<ElementC, LayoutC> tensor_d,
338
+ FillMode fill_mode_c,
339
+ BlasMode blas_mode) {
340
+
341
+ Rank2KComplex(
342
+ problem_size, alpha,
343
+ tensor_a, transform_a,
344
+ tensor_b, transform_b,
345
+ beta, tensor_c, tensor_d,
346
+ ScalarType(0),
347
+ fill_mode_c,
348
+ blas_mode);
349
+ }
350
+
351
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
352
+
353
+ } // namespace device
354
+ } // namespace reference
355
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_compare.h ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Defines host-side elementwise operations on TensorView.
33
+ */
34
+
35
+ #pragma once
36
+ // Standard Library includes
37
+ #include <utility>
38
+
39
+ // Cutlass includes
40
+ #include "cutlass/cutlass.h"
41
+ #include "cutlass/relatively_equal.h"
42
+
43
+ #include "cutlass/util/distribution.h"
44
+
45
+ #include "tensor_foreach.h"
46
+
47
+ namespace cutlass {
48
+ namespace reference {
49
+ namespace device {
50
+
51
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ namespace kernel {
54
+
55
+ template <typename Element>
56
+ __global__ void BlockCompareEqual(
57
+ int *equal,
58
+ Element const *ptr_A,
59
+ Element const *ptr_B,
60
+ size_t capacity) {
61
+
62
+ size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
63
+
64
+ for (; idx < capacity; idx += gridDim.x * blockDim.x) {
65
+
66
+ Element a = cutlass::ReferenceFactory<Element>::get(ptr_A, idx);
67
+ Element b = cutlass::ReferenceFactory<Element>::get(ptr_B, idx);
68
+
69
+ if (a != b) {
70
+ *equal = 0;
71
+
72
+ return;
73
+ }
74
+ }
75
+ }
76
+
77
+ template <typename Element>
78
+ __global__ void BlockCompareRelativelyEqual(
79
+ int *equal,
80
+ Element const *ptr_A,
81
+ Element const *ptr_B,
82
+ size_t capacity,
83
+ Element epsilon,
84
+ Element nonzero_floor) {
85
+
86
+ size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
87
+
88
+ for (; idx < capacity; idx += gridDim.x * blockDim.x) {
89
+
90
+ Element a = cutlass::ReferenceFactory<Element>::get(ptr_A, idx);
91
+ Element b = cutlass::ReferenceFactory<Element>::get(ptr_B, idx);
92
+
93
+ if (!relatively_equal(a, b, epsilon, nonzero_floor)) {
94
+ *equal = 0;
95
+ return;
96
+ }
97
+ }
98
+ }
99
+
100
+ } // namespace kernel
101
+
102
+
103
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
104
+
105
+ /// Performs a bit-level equality check between two blocks
106
+ template <typename Element>
107
+ bool BlockCompareEqual(
108
+ Element const *ptr_A,
109
+ Element const *ptr_B,
110
+ size_t capacity,
111
+ int grid_size = 0,
112
+ int block_size = 0,
113
+ cudaStream_t stream = nullptr) {
114
+
115
+ int equal_flag = 1;
116
+ int *device_equal_flag = nullptr;
117
+
118
+ if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
119
+ throw std::runtime_error("Failed to allocate device flag.");
120
+ }
121
+
122
+ if (cudaMemcpy(
123
+ device_equal_flag,
124
+ &equal_flag,
125
+ sizeof(int),
126
+ cudaMemcpyHostToDevice) != cudaSuccess) {
127
+
128
+ throw std::runtime_error("Failed to copy equality flag to device.");
129
+ }
130
+
131
+ if (!grid_size || !block_size) {
132
+
133
+ // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
134
+ cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
135
+ &grid_size,
136
+ &block_size,
137
+ reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));
138
+
139
+ if (result != cudaSuccess) {
140
+ throw std::runtime_error("Failed to query occupancy.");
141
+ }
142
+ // Limit block size. This has the effect of increasing the number of items processed by a
143
+ // single thread and reduces the impact of initialization overhead.
144
+ block_size = (block_size < 128 ? block_size : 128);
145
+ }
146
+
147
+ dim3 grid(grid_size, 1, 1);
148
+ dim3 block(block_size, 1, 1);
149
+
150
+ kernel::BlockCompareEqual<Element><<< grid, block, 0, stream >>>(device_equal_flag, ptr_A, ptr_B, capacity);
151
+
152
+ cudaStreamSynchronize(stream);
153
+
154
+ if (cudaMemcpy(
155
+ &equal_flag,
156
+ device_equal_flag,
157
+ sizeof(int),
158
+ cudaMemcpyDeviceToHost) != cudaSuccess) {
159
+
160
+ cudaFree(device_equal_flag);
161
+
162
+ throw std::runtime_error("Failed to copy equality flag from device.");
163
+ }
164
+
165
+ cudaFree(device_equal_flag);
166
+
167
+ return equal_flag;
168
+ }
169
+
170
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
171
+
172
+ /// Performs a bit-level equality check between two blocks
173
+ template <typename Element>
174
+ bool BlockCompareRelativelyEqual(
175
+ Element const *ptr_A,
176
+ Element const *ptr_B,
177
+ size_t capacity,
178
+ Element epsilon,
179
+ Element nonzero_floor,
180
+ int grid_size = 0,
181
+ int block_size = 0,
182
+ cudaStream_t stream = nullptr) {
183
+
184
+ int equal_flag = 1;
185
+ int *device_equal_flag = nullptr;
186
+
187
+ if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
188
+ throw std::runtime_error("Failed to allocate device flag.");
189
+ }
190
+
191
+ if (cudaMemcpy(
192
+ device_equal_flag,
193
+ &equal_flag,
194
+ sizeof(int),
195
+ cudaMemcpyHostToDevice) != cudaSuccess) {
196
+
197
+ throw std::runtime_error("Failed to copy equality flag to device.");
198
+ }
199
+
200
+ if (!grid_size || !block_size) {
201
+
202
+ // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
203
+ cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
204
+ &grid_size,
205
+ &block_size,
206
+ reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));
207
+
208
+ if (result != cudaSuccess) {
209
+ throw std::runtime_error("Failed to query occupancy.");
210
+ }
211
+ // Limit block size. This has the effect of increasing the number of items processed by a
212
+ // single thread and reduces the impact of initialization overhead.
213
+ block_size = (block_size < 128 ? block_size : 128);
214
+ }
215
+
216
+ dim3 grid(grid_size, 1, 1);
217
+ dim3 block(block_size, 1, 1);
218
+
219
+ kernel::BlockCompareRelativelyEqual<Element><<< grid, block, 0, stream >>>(
220
+ device_equal_flag,
221
+ ptr_A,
222
+ ptr_B,
223
+ capacity,
224
+ epsilon,
225
+ nonzero_floor
226
+ );
227
+
228
+ cudaStreamSynchronize(stream);
229
+
230
+ if (cudaMemcpy(
231
+ &equal_flag,
232
+ device_equal_flag,
233
+ sizeof(int),
234
+ cudaMemcpyDeviceToHost) != cudaSuccess) {
235
+
236
+ cudaFree(device_equal_flag);
237
+
238
+ throw std::runtime_error("Failed to copy equality flag from device.");
239
+ }
240
+
241
+ cudaFree(device_equal_flag);
242
+
243
+ return equal_flag;
244
+ }
245
+
246
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
247
+
248
+ } // device
249
+ } // reference
250
+ } // cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_fill.h ADDED
@@ -0,0 +1,2075 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
33
+ in this header are not specialized for any particular data layout and are therefore not
34
+ intended to offer the best possible performance. Rather, they are intended to be generic
35
+ reference implementations to support the CUTLASS unit tests.
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #if !defined(__CUDACC_RTC__)
41
+
42
+ // Standard Library includes
43
+ #include <utility>
44
+ #include <cstdlib>
45
+ #include <cmath>
46
+ #include <type_traits>
47
+ #include <cstdint>
48
+
49
+ #endif
50
+
51
+ // CUDA includes
52
+ #include <curand_kernel.h>
53
+
54
+ // Cutlass includes
55
+ #include "cutlass/cutlass.h"
56
+ #include "cutlass/array.h"
57
+ #include "cutlass/complex.h"
58
+ #include "cutlass/tensor_view.h"
59
+ #include "cutlass/blas3.h"
60
+ #include "cutlass/numeric_types.h"
61
+
62
+ #include "cutlass/layout/vector.h"
63
+
64
+ #include "cutlass/util/reference/device/tensor_foreach.h"
65
+ #include "cutlass/util/distribution.h"
66
+
67
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
68
+
69
+ namespace cutlass {
70
+ namespace reference {
71
+ namespace device {
72
+
73
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
74
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
75
+
76
+ namespace detail {
77
+
78
+ template <typename FloatType>
79
+ CUTLASS_DEVICE
80
+ FloatType random_normal_float(curandState_t *state) {
81
+ return curand_normal(state);
82
+ }
83
+
84
+ template <>
85
+ CUTLASS_DEVICE
86
+ double random_normal_float<double>(curandState_t *state) {
87
+ return curand_normal_double(state);
88
+ }
89
+
90
+ template <typename FloatType>
91
+ CUTLASS_DEVICE
92
+ FloatType random_uniform_float(curandState_t *state) {
93
+ return curand_uniform(state);
94
+ }
95
+
96
+ template <>
97
+ CUTLASS_DEVICE
98
+ double random_uniform_float<double>(curandState_t *state) {
99
+ return curand_uniform_double(state);
100
+ }
101
+
102
+ template <typename Element>
103
+ struct RandomGaussianFunc {
104
+
105
+ using FloatType = typename std::conditional<(sizeof(Element) > 4), double, float>::type;
106
+ using IntType = typename std::conditional<(sizeof(Element) > 4), int64_t, int>::type;
107
+
108
+ /// Parameters structure
109
+ struct Params {
110
+
111
+ //
112
+ // Data members
113
+ //
114
+
115
+ uint64_t seed;
116
+ FloatType mean;
117
+ FloatType stddev;
118
+ int int_scale;
119
+ FloatType float_scale_up;
120
+ FloatType float_scale_down;
121
+ int exclude_zero; ///< If non-negative, excludes zeros
122
+
123
+ //
124
+ // Methods
125
+ //
126
+
127
+ /// Construction of Gaussian RNG functor.
128
+ Params(
129
+ uint64_t seed_ = 0,
130
+ Element mean_ = 0,
131
+ Element stddev_ = 1,
132
+ int int_scale_ = -1,
133
+ int exclude_zero_ = -1
134
+ ):
135
+ seed(seed_),
136
+ mean(static_cast<FloatType>(mean_)),
137
+ stddev(static_cast<FloatType>(stddev_)),
138
+ int_scale(int_scale_),
139
+ exclude_zero(exclude_zero_) {
140
+
141
+ float_scale_up = FloatType(IntType(1) << int_scale); // scale up to clamp low order bits
142
+ float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
143
+ }
144
+ };
145
+
146
+ //
147
+ // Data members
148
+ //
149
+
150
+ /// Parameters object
151
+ Params params;
152
+
153
+ /// RNG state object
154
+ curandState_t rng_state;
155
+
156
+ //
157
+ // Methods
158
+ //
159
+
160
+ /// Device-side initialization of RNG
161
+ CUTLASS_DEVICE
162
+ RandomGaussianFunc(Params const &params): params(params) {
163
+
164
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
165
+
166
+ curand_init(params.seed, gtid, 0, &rng_state);
167
+ }
168
+
169
+ /// Compute random value and update RNG state
170
+ CUTLASS_DEVICE
171
+ Element operator()() {
172
+
173
+ FloatType rnd = random_normal_float<FloatType>(&rng_state);
174
+ rnd = params.mean + params.stddev * rnd;
175
+
176
+ Element result;
177
+ if (params.int_scale >= 0) {
178
+ rnd = FloatType(std::llround(rnd * params.float_scale_up));
179
+ result = Element(rnd * params.float_scale_down);
180
+ }
181
+ else {
182
+ result = Element(rnd);
183
+ }
184
+
185
+ if (params.exclude_zero >=0 && result == Element(0.0)) {
186
+ if (rnd > FloatType(0)) {
187
+ rnd += FloatType(1);
188
+ } else {
189
+ rnd -= FloatType(1);
190
+ }
191
+ result = Element(rnd);
192
+ }
193
+
194
+ return result;
195
+ }
196
+ };
197
+
198
+
199
+ template <typename Real>
200
+ struct RandomGaussianFunc<complex<Real>> {
201
+
202
+ using Element = complex<Real>;
203
+ using FloatType = typename std::conditional<(sizeof(Real) > 4), double, float>::type;
204
+ using IntType = typename std::conditional<(sizeof(Real) > 4), int64_t, int>::type;
205
+
206
+ /// Parameters structure
207
+ struct Params {
208
+
209
+ //
210
+ // Data members
211
+ //
212
+
213
+ uint64_t seed;
214
+ FloatType mean;
215
+ FloatType stddev;
216
+ int int_scale;
217
+ FloatType float_scale_up;
218
+ FloatType float_scale_down;
219
+ int exclude_zero; ///< If non-negative, excludes zeros
220
+
221
+ //
222
+ // Methods
223
+ //
224
+
225
+ /// Construction of Gaussian RNG functor.
226
+ Params(
227
+ uint64_t seed_ = 0,
228
+ Real mean_ = 0,
229
+ Real stddev_ = 1,
230
+ int int_scale_ = -1,
231
+ int exclude_zero_ = -1
232
+ ):
233
+ seed(seed_),
234
+ mean(static_cast<FloatType>(mean_)),
235
+ stddev(static_cast<FloatType>(stddev_)),
236
+ int_scale(int_scale_),
237
+ exclude_zero(exclude_zero_) {
238
+
239
+ float_scale_up = FloatType(IntType(1) << int_scale);
240
+ float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
241
+ }
242
+ };
243
+
244
+ //
245
+ // Data members
246
+ //
247
+
248
+ /// Parameters object
249
+ Params params;
250
+
251
+ /// RNG state object
252
+ curandState_t rng_state;
253
+
254
+ //
255
+ // Methods
256
+ //
257
+
258
+ /// Device-side initialization of RNG
259
+ CUTLASS_DEVICE
260
+ RandomGaussianFunc(Params const &params): params(params) {
261
+
262
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
263
+
264
+ curand_init(params.seed, gtid, 0, &rng_state);
265
+ }
266
+
267
+ /// Compute random value and update RNG state
268
+ CUTLASS_DEVICE
269
+ Element operator()() {
270
+
271
+ FloatType rnd_r = random_normal_float<FloatType>(&rng_state);
272
+ FloatType rnd_i = random_normal_float<FloatType>(&rng_state);
273
+ rnd_r = params.mean + params.stddev * rnd_r;
274
+ rnd_i = params.mean + params.stddev * rnd_i;
275
+
276
+ Element result;
277
+ if (params.int_scale >= 0) {
278
+ rnd_r = FloatType(std::llround(rnd_r * params.float_scale_up));
279
+ rnd_i = FloatType(std::llround(rnd_i * params.float_scale_up));
280
+
281
+ result = {
282
+ Real(rnd_r * params.float_scale_down),
283
+ Real(rnd_i * params.float_scale_down)
284
+ };
285
+ }
286
+ else {
287
+ result = Element(Real(rnd_r), Real(rnd_i));
288
+ }
289
+
290
+ if (params.exclude_zero >= 0 &&
291
+ result.real() == Real(0.0) &&
292
+ result.imag() == Real(0.0)) {
293
+
294
+ if (rnd_r > FloatType(0)) {
295
+ rnd_r += FloatType(1);
296
+ } else {
297
+ rnd_r -= FloatType(1);
298
+ }
299
+ result = Element(Real(rnd_r), Real(rnd_i));
300
+ }
301
+
302
+ return result;
303
+ }
304
+ };
305
+
306
+ /// Computes a random Gaussian distribution
307
+ template <
308
+ typename Element, ///< Element type
309
+ typename Layout> ///< Layout function
310
+ struct TensorFillRandomGaussianFunc {
311
+
312
+ /// View type
313
+ using TensorView = TensorView<Element, Layout>;
314
+
315
+ /// Scalar type
316
+ typedef typename TensorView::Element T;
317
+
318
+ /// Coordinate in tensor's index space
319
+ typedef typename TensorView::TensorCoord TensorCoord;
320
+
321
+ using RandomFunc = RandomGaussianFunc<Element>;
322
+
323
+ /// Parameters structure
324
+ struct Params {
325
+
326
+ //
327
+ // Data members
328
+ //
329
+
330
+ TensorView view;
331
+ typename RandomFunc::Params random;
332
+
333
+ //
334
+ // Methods
335
+ //
336
+
337
+ /// Construction of Gaussian RNG functor.
338
+ Params(
339
+ TensorView view_ = TensorView(),
340
+ typename RandomFunc::Params random_ = typename RandomFunc::Params()
341
+ ):
342
+ view(view_), random(random_) {
343
+
344
+ }
345
+ };
346
+
347
+ //
348
+ // Data members
349
+ //
350
+
351
+ Params params;
352
+ RandomFunc random;
353
+
354
+ //
355
+ // Methods
356
+ //
357
+
358
+ /// Device-side initialization of RNG
359
+ CUTLASS_DEVICE
360
+ TensorFillRandomGaussianFunc(Params const &params): params(params), random(params.random) {
361
+
362
+ }
363
+
364
+ /// Compute random value and update RNG state
365
+ CUTLASS_DEVICE
366
+ void operator()(TensorCoord const &coord) {
367
+
368
+ params.view.at(coord) = random();
369
+ }
370
+ };
371
+
372
+ } // namespace detail
373
+
374
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
375
+
376
+ /// Fills a tensor with random values with a Gaussian distribution.
377
+ template <
378
+ typename Element, ///< Element type
379
+ typename Layout> ///< Layout function
380
+ void TensorFillRandomGaussian(
381
+ TensorView<Element, Layout> view, ///< destination tensor
382
+ uint64_t seed, ///< seed for RNG
383
+ typename RealType<Element>::Type mean = Element(0), ///< Gaussian distribution's mean
384
+ typename RealType<Element>::Type stddev = Element(1), ///< Gaussian distribution's standard deviation
385
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
386
+ /// are not truncated to zero. Permits reducing precision of
387
+ /// data.
388
+ int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
389
+ cudaStream_t stream = nullptr) {
390
+
391
+ using RandomFunc = detail::RandomGaussianFunc<Element>;
392
+ using Func = detail::TensorFillRandomGaussianFunc<Element, Layout>;
393
+ using Params = typename Func::Params;
394
+
395
+ TensorForEach<Func, Layout::kRank, Params>(
396
+ view.extent(),
397
+ Params(view, typename RandomFunc::Params(seed, mean, stddev, bits, exclude_zero)),
398
+ /*grid_size*/0, /*block_size*/0,
399
+ stream
400
+ );
401
+ }
402
+
403
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
404
+
405
+ /// Fills a tensor with random values with a Gaussian distribution.
406
+ template <typename Element> ///< Element type
407
+ void BlockFillRandomGaussian(
408
+ Element *ptr,
409
+ size_t capacity,
410
+ uint64_t seed, ///< seed for RNG
411
+ typename RealType<Element>::Type mean, ///< Gaussian distribution's mean
412
+ typename RealType<Element>::Type stddev, ///< Gaussian distribution's standard deviation
413
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
414
+ /// are not truncated to zero. Permits reducing precision of
415
+ /// data.
416
+ cudaStream_t stream = nullptr) {
417
+
418
+ using RandomFunc = detail::RandomGaussianFunc<Element>;
419
+
420
+ typename RandomFunc::Params params(seed, mean, stddev, bits);
421
+
422
+ BlockForEach<Element, RandomFunc>(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream);
423
+ }
424
+
425
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
426
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
427
+
428
+ namespace detail {
429
+
430
+ /// Computes a random uniform distribution
431
+ template <typename Element> ///< Element type
432
+ struct RandomUniformFunc {
433
+
434
+ using FloatType = typename std::conditional<
435
+ (sizeof(Element) > 4),
436
+ double,
437
+ float>::type;
438
+
439
+ using IntType = typename std::conditional<
440
+ (sizeof(Element) > 4),
441
+ int64_t,
442
+ int>::type;
443
+
444
+ /// Parameters structure
445
+ struct Params {
446
+
447
+ //
448
+ // Data members
449
+ //
450
+
451
+ uint64_t seed;
452
+ FloatType range;
453
+ FloatType max;
454
+ int int_scale;
455
+ double pnan;
456
+ FloatType float_scale_up;
457
+ FloatType float_scale_down;
458
+ int exclude_zero; ///< If non-negative, excludes zeros
459
+
460
+ /// Default ctor
461
+ CUTLASS_HOST_DEVICE
462
+ Params() { }
463
+
464
+ //
465
+ // Methods
466
+ //
467
+
468
+ /// Construction of Gaussian RNG functor.
469
+ Params(
470
+ uint64_t seed_ = 0,
471
+ Element max_ = 1,
472
+ Element min = 0,
473
+ int int_scale_ = -1,
474
+ double pnan_ = 0,
475
+ int exclude_zero_ = -1
476
+ ):
477
+ seed(seed_),
478
+ range(static_cast<FloatType>(max_) - static_cast<FloatType>(min)),
479
+ max(static_cast<FloatType>(max_)),
480
+ int_scale(int_scale_),
481
+ pnan(pnan_),
482
+ exclude_zero(exclude_zero_) {
483
+
484
+ float_scale_up = FloatType(IntType(1) << int_scale); // scale up to clamp low order bits
485
+ float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
486
+
487
+ // Handle cases where min = 0 or max = 0 for excluding zeros
488
+ if (exclude_zero >= 0) {
489
+ range = (min == Element(0)) ? range - FloatType(1): range;
490
+ max = (max_ == Element(0)) ? max - FloatType(1): max;
491
+ }
492
+ }
493
+ };
494
+
495
+ //
496
+ // Data members
497
+ //
498
+
499
+ /// Parameters object
500
+ Params params;
501
+
502
+ /// RNG state object
503
+ curandState_t rng_state;
504
+
505
+ //
506
+ // Methods
507
+ //
508
+
509
+ /// Device-side initialization of RNG
510
+ CUTLASS_DEVICE
511
+ RandomUniformFunc(Params const &params): params(params) {
512
+
513
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
514
+
515
+ curand_init(params.seed, gtid, 0, &rng_state);
516
+ }
517
+
518
+ /// Compute random value and update RNG state
519
+ CUTLASS_DEVICE
520
+ Element operator()() {
521
+
522
+ // Draw random float in [0.0, 1.0] to determine if element should be NaN.
523
+ if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
524
+ if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
525
+ return Element(NAN);
526
+ }
527
+ }
528
+
529
+ FloatType rnd = random_uniform_float<FloatType>(&rng_state);
530
+ rnd = params.max - params.range * rnd;
531
+
532
+ // Random values are cast to integer after scaling by a power of two to facilitate error
533
+ // testing
534
+ Element result;
535
+
536
+ if (params.int_scale >= 0) {
537
+ rnd = FloatType(std::llround(rnd * params.float_scale_up));
538
+ result = Element(rnd * params.float_scale_down);
539
+ }
540
+ else {
541
+ result = Element(rnd);
542
+ }
543
+
544
+ if (params.exclude_zero >=0 && result == Element(0.0)) {
545
+ if (rnd > FloatType(0)) {
546
+ rnd = std::min(params.max, rnd + FloatType(1));
547
+ } else {
548
+ rnd = std::max((params.max - params.range), rnd - FloatType(1));
549
+ }
550
+ result = Element(rnd);
551
+ }
552
+
553
+ return result;
554
+ }
555
+ };
556
+
557
+ /// Computes a random Gaussian distribution
558
+ template <typename Real>
559
+ struct RandomUniformFunc<complex<Real>> {
560
+
561
+ using Element = complex<Real>;
562
+
563
+ using FloatType = typename std::conditional<
564
+ (sizeof(Real) > 4),
565
+ double,
566
+ float>::type;
567
+
568
+ using IntType = typename std::conditional<
569
+ (sizeof(Real) > 4),
570
+ int64_t,
571
+ int>::type;
572
+
573
+ /// Parameters structure
574
+ struct Params {
575
+
576
+ //
577
+ // Data members
578
+ //
579
+
580
+ uint64_t seed;
581
+ FloatType range;
582
+ FloatType min;
583
+ int int_scale;
584
+ double pnan;
585
+ FloatType float_scale_up;
586
+ FloatType float_scale_down;
587
+ int exclude_zero; ///< If non-negative, excludes zeros
588
+
589
+ /// Default ctor
590
+ CUTLASS_HOST_DEVICE
591
+ Params() { }
592
+
593
+ //
594
+ // Methods
595
+ //
596
+
597
+ /// Construction of Gaussian RNG functor.
598
+ Params(
599
+ uint64_t seed_ = 0,
600
+ FloatType max = 1,
601
+ FloatType min_ = 0,
602
+ int int_scale_ = -1,
603
+ double pnan_ = 0,
604
+ int exclude_zero_ = -1
605
+ ):
606
+ seed(seed_),
607
+ range(static_cast<FloatType>(max - min_)),
608
+ min(static_cast<FloatType>(min_)),
609
+ int_scale(int_scale_),
610
+ pnan(pnan_),
611
+ exclude_zero(exclude_zero_) {
612
+
613
+ float_scale_up = FloatType(IntType(1) << int_scale);
614
+ float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
615
+
616
+ // Handle cases where min = 0 or max = 0 for excluding zeros
617
+ if (exclude_zero >= 0) {
618
+ min = (min == FloatType(0)) ? min + FloatType(1): min;
619
+ range = (max == FloatType(0)) ? range - FloatType(1): range;
620
+ }
621
+ }
622
+ };
623
+
624
+ //
625
+ // Data members
626
+ //
627
+
628
+ /// Parameters object
629
+ Params params;
630
+
631
+ /// RNG state object
632
+ curandState_t rng_state;
633
+
634
+ //
635
+ // Methods
636
+ //
637
+
638
+ /// Device-side initialization of RNG
639
+ CUTLASS_DEVICE
640
+ RandomUniformFunc(Params const &params): params(params) {
641
+
642
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
643
+
644
+ curand_init(params.seed, gtid, 0, &rng_state);
645
+ }
646
+
647
+ /// Compute random value and update RNG state
648
+ CUTLASS_DEVICE
649
+ Element operator()() {
650
+
651
+ // Draw random float in [0.0, 1.0] to determine if element should be NaN.
652
+ if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
653
+ if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
654
+ return Element(Real(NAN), Real(NAN));
655
+ }
656
+ }
657
+
658
+ FloatType rnd_r = random_uniform_float<FloatType>(&rng_state);
659
+ FloatType rnd_i = random_uniform_float<FloatType>(&rng_state);
660
+
661
+ rnd_r = params.min + params.range * rnd_r;
662
+ rnd_i = params.min + params.range * rnd_i;
663
+
664
+ // Random values are cast to integer after scaling by a power of two to facilitate error
665
+ // testing
666
+ Element result;
667
+
668
+ if (params.int_scale >= 0) {
669
+ rnd_r = FloatType(std::llround(rnd_r * params.float_scale_up));
670
+ rnd_i = FloatType(std::llround(rnd_i * params.float_scale_up));
671
+
672
+ result = {
673
+ Real(rnd_r * params.float_scale_down),
674
+ Real(rnd_i * params.float_scale_down)
675
+ };
676
+ }
677
+ else {
678
+ result = Element(Real(rnd_r), Real(rnd_i));
679
+ }
680
+
681
+ if (params.exclude_zero >= 0 &&
682
+ result.real() == Real(0.0) &&
683
+ result.imag() == Real(0.0)) {
684
+
685
+ if (rnd_r > FloatType(0)) {
686
+ rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1));
687
+ } else {
688
+ rnd_r = std::max((params.min), rnd_r - FloatType(1));
689
+ }
690
+ result = Element(Real(rnd_r), Real(rnd_i));
691
+ }
692
+
693
+ return result;
694
+ }
695
+ };
696
+
697
+ /// Computes a random uniform distribution
698
+ template <
699
+ typename Element, ///< Element type
700
+ typename Layout> ///< Layout function
701
+ struct TensorFillRandomUniformFunc {
702
+
703
+ /// View type
704
+ using TensorView = TensorView<Element, Layout>;
705
+
706
+ /// Scalar type
707
+ typedef typename TensorView::Element T;
708
+
709
+ /// Coordinate in tensor's index space
710
+ typedef typename TensorView::TensorCoord TensorCoord;
711
+
712
+ using RandomFunc = RandomUniformFunc<Element>;
713
+
714
+ /// Parameters structure
715
+ struct Params {
716
+
717
+ //
718
+ // Data members
719
+ //
720
+
721
+ TensorView view;
722
+ typename RandomFunc::Params random;
723
+
724
+ /// Default ctor
725
+ CUTLASS_HOST_DEVICE
726
+ Params() { }
727
+
728
+ //
729
+ // Methods
730
+ //
731
+
732
+ /// Construction of Gaussian RNG functor.
733
+ Params(
734
+ TensorView view_ = TensorView(),
735
+ typename RandomFunc::Params random_ = RandomFunc::Params()
736
+ ):
737
+ view(view_), random(random_) {
738
+
739
+ }
740
+ };
741
+
742
+ //
743
+ // Data members
744
+ //
745
+
746
+ Params params;
747
+ RandomFunc random;
748
+
749
+ //
750
+ // Methods
751
+ //
752
+
753
+ /// Device-side initialization of RNG
754
+ CUTLASS_DEVICE
755
+ TensorFillRandomUniformFunc(Params const &params): params(params), random(params.random) {
756
+ }
757
+
758
+ /// Compute random value and update RNG state
759
+ CUTLASS_DEVICE
760
+ void operator()(TensorCoord const &coord) {
761
+
762
+ params.view.at(coord) = random();
763
+ }
764
+ };
765
+
766
+ } // namespace detail
767
+
768
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
769
+
770
+ /// Fills a tensor with random values with a uniform random distribution.
771
+ template <
772
+ typename Element, ///< Element type
773
+ typename Layout> ///< Layout function
774
+ void TensorFillRandomUniform(
775
+ TensorView<Element, Layout> view, ///< destination tensor
776
+ uint64_t seed, ///< seed for RNG
777
+ typename RealType<Element>::Type max = Element(1), ///< upper bound of distribution
778
+ typename RealType<Element>::Type min = Element(0), ///< lower bound for distribution
779
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
780
+ /// are not truncated to zero. Permits reducing precision of
781
+ /// data.
782
+ double pnan = 0, ///< Percentage of NaN elements.
783
+ int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
784
+ cudaStream_t stream = nullptr) {
785
+
786
+ using RandomFunc = detail::RandomUniformFunc<Element>;
787
+ using Func = detail::TensorFillRandomUniformFunc<Element, Layout>;
788
+ using Params = typename Func::Params;
789
+
790
+ typename RandomFunc::Params random(seed, max, min, bits, pnan, exclude_zero);
791
+
792
+ TensorForEach<Func, Layout::kRank, Params>(
793
+ view.extent(),
794
+ Params(view, random),
795
+ /*grid_size*/0, /*block_size*/0,
796
+ stream
797
+ );
798
+ }
799
+
800
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
801
+
802
+ /// Fills a tensor with random values with a uniform random distribution.
803
+ template <typename Element>
804
+ void BlockFillRandomUniform(
805
+ Element *ptr,
806
+ size_t capacity,
807
+ uint64_t seed, ///< seed for RNG
808
+ typename RealType<Element>::Type max, ///< upper bound of distribution
809
+ typename RealType<Element>::Type min, ///< lower bound for distribution
810
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
811
+ /// are not truncated to zero. Permits reducing precision of
812
+ /// data.
813
+ double pnan = 0, ///< Percentage of NaN elements.
814
+ cudaStream_t stream = nullptr) {
815
+
816
+ using RandomFunc = detail::RandomUniformFunc<Element>;
817
+
818
+ typename RandomFunc::Params params(seed, max, min, bits, pnan);
819
+
820
+ BlockForEach<Element, RandomFunc>(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream);
821
+ }
822
+
823
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
824
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
825
+
826
+ namespace detail {
827
+
828
+ /// Computes a random sparse meta
829
+ template <typename Element> ///< Element type
830
+ struct RandomSparseMetaFunc {
831
+
832
+ using FloatType = float;
833
+
834
+ using IntType = int32_t;
835
+
836
+ /// Parameters structure
837
+ struct Params {
838
+
839
+ //
840
+ // Data members
841
+ //
842
+
843
+ uint64_t seed;
844
+ FloatType range;
845
+ int MetaSizeInBits;
846
+
847
+ /// Default ctor
848
+ CUTLASS_HOST_DEVICE
849
+ Params() { }
850
+
851
+ //
852
+ // Methods
853
+ //
854
+
855
+ /// Construction of Gaussian RNG functor.
856
+ Params(
857
+ uint64_t seed_ = 0,
858
+ int MetaSizeInBits_ = 2
859
+ ):
860
+ seed(seed_),
861
+ MetaSizeInBits(MetaSizeInBits_) {
862
+ if (MetaSizeInBits_ == 2) {
863
+ range = 6;
864
+ }
865
+ else if (MetaSizeInBits_ == 4) {
866
+ range = 2;
867
+ }
868
+ else {
869
+ throw std::invalid_argument("Invalid MetaSizeInBits");
870
+ }
871
+ }
872
+ };
873
+
874
+ //
875
+ // Data members
876
+ //
877
+
878
+ /// Parameters object
879
+ Params params;
880
+
881
+ /// RNG state object
882
+ curandState_t rng_state;
883
+
884
+ //
885
+ // Methods
886
+ //
887
+
888
+ /// Device-side initialization of RNG
889
+ CUTLASS_DEVICE
890
+ RandomSparseMetaFunc(Params const &params): params(params) {
891
+
892
+ uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
893
+
894
+ curand_init(params.seed, gtid, 0, &rng_state);
895
+ }
896
+
897
+ /// Compute random value and update RNG state
898
+ CUTLASS_DEVICE
899
+ Element operator()() {
900
+ Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe};
901
+ Element TwoToOneMeta[2] = {0x4, 0xe};
902
+
903
+ Element *MetaArray =
904
+ (params.MetaSizeInBits == 2) ? FourToTwoMeta : TwoToOneMeta;
905
+
906
+ Element result = 0x0;
907
+
908
+ CUTLASS_PRAGMA_UNROLL
909
+ for (int i = 0; i < cutlass::sizeof_bits<Element>::value / 4; ++i) {
910
+ FloatType rnd = random_uniform_float<FloatType>(&rng_state);
911
+ rnd = params.range * rnd;
912
+ Element meta = MetaArray[(int)rnd];
913
+
914
+ result = (Element)(result | ((Element)(meta << (i * 4))));
915
+ }
916
+
917
+ return result;
918
+ }
919
+ };
920
+
921
+ /// Computes a random Gaussian distribution
922
+ template <
923
+ typename Element, ///< Element type
924
+ typename Layout> ///< Layout function
925
+ struct TensorFillRandomSparseMetaFunc {
926
+
927
+ /// View type
928
+ using TensorView = TensorView<Element, Layout>;
929
+
930
+ /// Scalar type
931
+ typedef typename TensorView::Element T;
932
+
933
+ /// Coordinate in tensor's index space
934
+ typedef typename TensorView::TensorCoord TensorCoord;
935
+
936
+ using RandomFunc = RandomSparseMetaFunc<Element>;
937
+
938
+ /// Parameters structure
939
+ struct Params {
940
+
941
+ //
942
+ // Data members
943
+ //
944
+
945
+ TensorView view;
946
+ typename RandomFunc::Params random;
947
+
948
+ /// Default ctor
949
+ CUTLASS_HOST_DEVICE
950
+ Params() { }
951
+
952
+ //
953
+ // Methods
954
+ //
955
+
956
+ /// Construction of Gaussian RNG functor.
957
+ Params(
958
+ TensorView view_ = TensorView(),
959
+ typename RandomFunc::Params random_ = RandomFunc::Params()
960
+ ):
961
+ view(view_), random(random_) {
962
+
963
+ }
964
+ };
965
+
966
+ //
967
+ // Data members
968
+ //
969
+
970
+ Params params;
971
+ RandomFunc random;
972
+
973
+ //
974
+ // Methods
975
+ //
976
+
977
+ /// Device-side initialization of RNG
978
+ CUTLASS_DEVICE
979
+ TensorFillRandomSparseMetaFunc(Params const &params): params(params), random(params.random) {
980
+ }
981
+
982
+ /// Compute random value and update RNG state
983
+ CUTLASS_DEVICE
984
+ void operator()(TensorCoord const &coord) {
985
+
986
+ params.view.at(coord) = random();
987
+ }
988
+ };
989
+
990
+ } // namespace detail
991
+
992
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
993
+
994
+ /// Fills a tensor with random values with a uniform random distribution.
995
+ template <
996
+ typename Element, ///< Element type
997
+ typename Layout> ///< Layout function
998
+ void TensorFillRandomSparseMeta(
999
+ TensorView<Element, Layout> view, ///< destination tensor
1000
+ uint64_t seed, ///< seed for RNG
1001
+ int MetaSizeInBits = 2, ///< meta data size
1002
+ cudaStream_t stream = nullptr) {
1003
+
1004
+ using RandomFunc = detail::RandomSparseMetaFunc<Element>;
1005
+ using Func = detail::TensorFillRandomUniformFunc<Element, Layout>;
1006
+ using Params = typename Func::Params;
1007
+
1008
+ typename RandomFunc::Params random(seed, MetaSizeInBits);
1009
+
1010
+ TensorForEach<Func, Layout::kRank, Params>(
1011
+ view.extent(),
1012
+ Params(view, random),
1013
+ /*grid_size*/0, /*block_size*/0,
1014
+ stream
1015
+ );
1016
+ }
1017
+
1018
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1019
+
1020
+ /// Fills a tensor with random values with a uniform random distribution.
1021
+ template <typename Element>
1022
+ void BlockFillRandomSparseMeta(
1023
+ Element *ptr,
1024
+ size_t capacity,
1025
+ uint64_t seed, ///< seed for RNG
1026
+ int MetaSizeInBits = 2, ///< meta data size
1027
+ cudaStream_t stream = nullptr) {
1028
+
1029
+ using RandomFunc = detail::RandomSparseMetaFunc<Element>;
1030
+
1031
+ typename RandomFunc::Params params(seed, MetaSizeInBits);
1032
+
1033
+ BlockForEach<Element, RandomFunc>(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream);
1034
+ }
1035
+
1036
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1037
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1038
+
1039
namespace detail {

/// Functor to fill a tensor with zeros off the diagonal and a uniform value on the diagonal.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorFillDiagonalFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;              ///< destination tensor view
    Element diag;                 ///< value written on the diagonal
    Element other;                ///< value written off the diagonal

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    //
    // Methods
    //

    /// Constructs parameters for the diagonal-fill functor.
    Params(
      TensorView view_ = TensorView(),
      Element diag_ = Element(1),
      Element other_ = Element(0)
    ):
      view(view_), diag(diag_), other(other_) {

    }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorFillDiagonalFunc(Params const &params): params(params) {

  }

  /// Updates the tensor: writes 'diag' on the generalized diagonal, 'other' elsewhere.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    bool is_diag = true;

    // A coordinate lies on the generalized diagonal when all of its components are equal.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < Layout::kRank; ++i) {
      if (coord[i] != coord[i - 1]) {
        is_diag = false;
        break;
      }
    }

    params.view.at(coord) = (is_diag ? params.diag : params.other);
  }
};

// Overwrites the elements of a tensor with a uniform value depending on fill mode
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorFillPartialFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;              ///< destination tensor view
    Element element;              ///< value written to covered elements
    FillMode fill_mode;           ///< region of the tensor to overwrite

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params(): fill_mode(FillMode::kNone) { }

    //
    // Methods
    //

    /// Constructs parameters for the partial-fill functor.
    Params(
      TensorView view_,
      Element element_,
      FillMode fill_mode_
    ):
      view(view_), element(element_), fill_mode(fill_mode_) {

    }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorFillPartialFunc(Params const &params): params(params) {

  }

  /// Overwrites the element if it is within the covered region.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    bool predicate = true;

    switch (params.fill_mode) {
      case FillMode::kFull:
        predicate = true;
        break;

      // Lower region: every component must be >= the next one (coord[i-1] >= coord[i]).
      case FillMode::kLower:
        CUTLASS_PRAGMA_UNROLL
        for (int i = 1; i < Layout::kRank; ++i) {
          if (coord[i - 1] < coord[i]) {
            predicate = false;
            break;
          }
        }
        break;

      // Upper region: every component must be <= the next one (coord[i-1] <= coord[i]).
      case FillMode::kUpper:
        CUTLASS_PRAGMA_UNROLL
        for (int i = 1; i < Layout::kRank; ++i) {
          if (coord[i - 1] > coord[i]) {
            predicate = false;
            break;
          }
        }
        break;

      // Diagonal: all components must be equal.
      case FillMode::kDiagonal:
        CUTLASS_PRAGMA_UNROLL
        for (int i = 1; i < Layout::kRank; ++i) {
          if (coord[i - 1] != coord[i]) {
            predicate = false;
            break;
          }
        }
        break;

      case FillMode::kNone: // fall-through

      default:
        predicate = false;
        break;
    }

    if (predicate) {
      params.view.at(coord) = params.element;
    }
  }
};

/// Functor that clears elements on the wrong side of the fill mode, up to an alignment band.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorClearPartialFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Only rank-2 tensors (matrices) are supported by this functor.
  static_assert((Layout::kRank == 2), "TensorClearPartial is only supported for matrices");

  /// Parameters structure
  struct Params {
    TensorView view{};            ///< destination tensor view
    Element element{};            ///< value written to cleared elements (typically zero)
    FillMode fill_mode{FillMode::kNone};  ///< region the tensor is expected to occupy
    int alignment{0};             ///< width of the band beyond the diagonal that is cleared
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorClearPartialFunc(Params const &params): params(params) {

  }

  /// Overwrites the element if it is within the covered region.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    bool predicate = true;

    switch (params.fill_mode) {

      // kLower keeps the lower triangle: clear only strictly-upper elements
      // within 'alignment' columns of the diagonal.
      case FillMode::kLower:
        if ((coord[0] >= coord[1]) ||
            ((coord[1] - coord[0]) >= params.alignment)) {
          predicate = false;
          break;
        }
        break;

      // kUpper keeps the upper triangle: clear only strictly-lower elements
      // within 'alignment' rows of the diagonal.
      case FillMode::kUpper:
        if ((coord[0] <= coord[1]) ||
            ((coord[0] - coord[1]) >= params.alignment)) {
          predicate = false;
          break;
        }
        break;

      case FillMode::kNone: // fall-through

      default:
        predicate = false;
        break;
    }

    if (predicate) {
      params.view.at(coord) = params.element;
    }
  }
};

} // namespace detail
1315
+
1316
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1317
+
1318
+ /// Fills a tensor everywhere with a unique value for its diagonal.
1319
+ template <
1320
+ typename Element, ///< Element type
1321
+ typename Layout> ///< Layout function
1322
+ void TensorFillDiagonal(
1323
+ TensorView<Element, Layout> view, ///< destination tensor
1324
+ Element diag = Element(1), ///< value to write in the diagonal
1325
+ Element other = Element(0), ///< value to write off the diagonal
1326
+ cudaStream_t stream = nullptr) {
1327
+
1328
+ typedef detail::TensorFillDiagonalFunc<Element, Layout> Func;
1329
+ typedef typename Func::Params Params;
1330
+
1331
+ TensorForEach<Func, Layout::kRank, Params>(
1332
+ view.extent(),
1333
+ Params(view, diag, other),
1334
+ /*grid_size*/0, /*block_size*/0,
1335
+ stream
1336
+ );
1337
+ }
1338
+
1339
+ /// Fills a tensor partially depending on fill mode. Elements not covered by the fillmode are
1340
+ /// not written.
1341
+ template <
1342
+ typename Element, ///< Element type
1343
+ typename Layout> ///< Layout function
1344
+ void TensorFillPartial(
1345
+ TensorView<Element, Layout> view, ///< destination tensor
1346
+ Element element,
1347
+ FillMode fill_mode,
1348
+ cudaStream_t stream = nullptr) {
1349
+
1350
+ typedef detail::TensorFillPartialFunc<Element, Layout> Func;
1351
+ typedef typename Func::Params Params;
1352
+
1353
+ TensorForEach<Func, Layout::kRank, Params>(
1354
+ view.extent(),
1355
+ Params(view, element, fill_mode),
1356
+ stream
1357
+ );
1358
+ }
1359
+
1360
+ /// Clears a tensor partially depending on fill mode and alignment. Elements on the wrong-side
1361
+ /// of fillmode (upto the alignment) are overwritten with the user supplied element (typically zeros)
1362
+ template <
1363
+ typename Element, ///< Element type
1364
+ typename Layout> ///< Layout function
1365
+ void TensorClearPartial(
1366
+ TensorView<Element, Layout> view, ///< destination tensor
1367
+ Element element,
1368
+ FillMode fill_mode,
1369
+ int alignment,
1370
+ cudaStream_t stream = nullptr) {
1371
+
1372
+ typedef detail::TensorClearPartialFunc<Element, Layout> Func;
1373
+ typedef typename Func::Params Params;
1374
+
1375
+ TensorForEach<Func, Layout::kRank, Params>(
1376
+ view.extent(),
1377
+ Params{view, element, fill_mode, alignment},
1378
+ /*grid_size*/0, /*block_size*/0,
1379
+ stream
1380
+ );
1381
+ }
1382
+
1383
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1384
+
1385
+ /// Fills a tensor with a uniform value
1386
+ template <
1387
+ typename Element, ///< Element type
1388
+ typename Layout> ///< Layout function
1389
+ void TensorFill(
1390
+ TensorView<Element, Layout> view, ///< destination tensor
1391
+ Element val = Element(0), ///< value to uniformly fill it with
1392
+ cudaStream_t stream = nullptr) {
1393
+
1394
+ TensorFillDiagonal(view, val, val, stream);
1395
+ }
1396
+
1397
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1398
+
1399
+ /// Fills a tensor's diagonal with 1 and 0 everywhere else.
1400
+ template <
1401
+ typename Element, ///< Element type
1402
+ typename Layout> ///< Layout function
1403
+ void TensorFillIdentity(
1404
+ TensorView<Element, Layout> view, ///< destination tensor
1405
+ cudaStream_t stream = nullptr) {
1406
+
1407
+ TensorFillDiagonal(view, Element(1), Element(0), stream);
1408
+ }
1409
+
1410
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1411
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1412
+
1413
namespace detail {

/// Functor that overwrites diagonal elements with a uniform value, leaving
/// off-diagonal elements untouched.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorUpdateDiagonalFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;              ///< destination tensor view
    Element diag;                 ///< value written on the diagonal

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    //
    // Methods
    //

    /// Constructs parameters for the update-diagonal functor.
    Params(
      TensorView view_ = TensorView(),
      Element diag_ = Element(1)
    ):
      view(view_), diag(diag_) {

    }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorUpdateDiagonalFunc(Params const &params): params(params) {

  }

  /// Writes 'diag' only when the coordinate lies on the generalized diagonal.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    bool is_diag = true;

    // On the diagonal iff all coordinate components are equal.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < Layout::kRank; ++i) {
      if (coord[i] != coord[i - 1]) {
        is_diag = false;
        break;
      }
    }

    if (is_diag) {
      params.view.at(coord) = params.diag;
    }
  }
};

} // namespace detail
1496
+
1497
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1498
+
1499
+ /// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements.
1500
+ template <
1501
+ typename Element, ///< Element type
1502
+ typename Layout> ///< Layout function
1503
+ void TensorUpdateDiagonal(
1504
+ TensorView<Element, Layout> view, ///< destination tensor
1505
+ Element diag = Element(1),
1506
+ cudaStream_t stream = nullptr) {
1507
+
1508
+ typedef detail::TensorUpdateDiagonalFunc<Element, Layout> Func;
1509
+ typedef typename Func::Params Params;
1510
+
1511
+ TensorForEach<Func, Layout::kRank, Params>(
1512
+ view.extent(),
1513
+ Params(view, diag),
1514
+ /*grid_size*/0, /*block_size*/0,
1515
+ stream
1516
+ );
1517
+ }
1518
+
1519
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1520
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1521
+
1522
namespace detail {

/// Functor that overwrites off-diagonal elements with a uniform value, leaving
/// diagonal elements untouched.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorUpdateOffDiagonalFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;              ///< destination tensor view
    Element other;                ///< value written off the diagonal

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    //
    // Methods
    //

    /// Constructs parameters for the update-off-diagonal functor.
    Params(
      TensorView view_ = TensorView(),
      Element other_ = Element(0)
    ):
      view(view_), other(other_) {

    }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorUpdateOffDiagonalFunc(Params const &params): params(params) {

  }

  /// Writes 'other' only when the coordinate is NOT on the generalized diagonal.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    bool is_diag = true;

    // On the diagonal iff all coordinate components are equal.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < Layout::kRank; ++i) {
      if (coord[i] != coord[i - 1]) {
        is_diag = false;
        break;
      }
    }

    if (!is_diag) {
      params.view.at(coord) = params.other;
    }
  }
};

} // namespace detail
1605
+
1606
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1607
+
1608
+ /// Writes a uniform value to all elements in the tensor without modifying diagonal elements.
1609
+ template <
1610
+ typename Element, ///< Element type
1611
+ typename Layout> ///< Layout function
1612
+ void TensorUpdateOffDiagonal(
1613
+ TensorView<Element, Layout> view, ///< destination tensor
1614
+ Element other = Element(1),
1615
+ cudaStream_t stream = nullptr) {
1616
+
1617
+ typedef detail::TensorUpdateOffDiagonalFunc<Element, Layout> Func;
1618
+ typedef typename Func::Params Params;
1619
+
1620
+ TensorForEach<Func, Layout::kRank, Params>(
1621
+ view.extent(),
1622
+ Params(view, other),
1623
+ /*grid_size*/0, /*block_size*/0,
1624
+ stream
1625
+ );
1626
+ }
1627
+
1628
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1629
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1630
+
1631
namespace detail {

/// Functor that fills each element with an affine function of its coordinate:
///   view(coord) = s + sum_i v[i] * coord[i]
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorFillLinearFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;              ///< destination tensor view
    Array<Element, Layout::kRank> v;  ///< per-rank coefficients of the linear combination
    Element s;                    ///< constant offset added to every element

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    //
    // Methods
    //

    /// Constructs parameters for the linear-fill functor.
    Params(
      TensorView view_,           ///< destination tensor
      Array<Element, Layout::kRank> const & v_,
      Element s_ = Element(0)
    ):
      view(view_), v(v_), s(s_) {

    }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorFillLinearFunc(Params const &params): params(params) {

  }

  /// Computes s + dot(v, coord) and stores it at the coordinate.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    Element sum = params.s;

    // For narrow element types (<= 32 bits), accumulate in a wider type
    // (complex<float> / int32_t / float) to avoid rounding or overflow in the
    // narrow representation, converting back to Element only after each step.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Layout::kRank; ++i) {
      if constexpr (is_complex<Element>::value) {
        if constexpr (sizeof_bits<Element>::value <= 32) {
          sum = Element(static_cast<complex<float>>(sum) +
                        static_cast<complex<float>>(params.v[i]) * static_cast<complex<float>>(coord[i]));
        }
        // NOTE(review): complex elements wider than 32 bits fall through both
        // branches here and are never accumulated, so 'sum' stays at 's'.
        // This looks like a gap — confirm whether wide complex types are
        // intended to be supported by TensorFillLinear.
      }
      else if constexpr (sizeof_bits<Element>::value <= 32) {
        if constexpr (std::numeric_limits<Element>::is_integer) {
          sum = Element(static_cast<int32_t>(sum) +
                        static_cast<int32_t>(params.v[i]) * static_cast<int32_t>(coord[i]));
        }
        else {
          sum = Element(static_cast<float>(sum) +
                        static_cast<float>(params.v[i]) * static_cast<float>(coord[i]));
        }
      }
      else {
        // Wide non-complex types accumulate natively.
        sum += params.v[i] * coord[i];
      }
    }

    params.view.at(coord) = sum;
  }
};

} // namespace detail
1729
+
1730
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1731
+
1732
+ /// Fills tensor with a linear combination of its coordinate and another vector
1733
+ template <
1734
+ typename Element, ///< Element type
1735
+ typename Layout> ///< Layout function
1736
+ void TensorFillLinear(
1737
+ TensorView<Element, Layout> view, ///< destination tensor
1738
+ Array<Element, Layout::kRank> const & v,
1739
+ Element s = Element(0),
1740
+ cudaStream_t stream = nullptr) {
1741
+
1742
+ using Func = detail::TensorFillLinearFunc<Element, Layout>;
1743
+ using Params = typename Func::Params;
1744
+
1745
+ TensorForEach<Func, Layout::kRank, Params>(
1746
+ view.extent(),
1747
+ Params(view, v, s),
1748
+ /*grid_size*/0, /*block_size*/0,
1749
+ stream
1750
+ );
1751
+ }
1752
+
1753
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1754
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1755
+
1756
/// Fills a tensor with random values from a distribution.
///
/// Dispatches to TensorFillRandomGaussian or TensorFillRandomUniform based on
/// dist.kind.
/// NOTE(review): any other Distribution kind is silently ignored (the tensor is
/// left unmodified) — confirm whether that is the intended contract.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
void TensorFillRandom(
  TensorView<Element, Layout> view,       ///< destination tensor
  uint64_t seed,                          ///< seed for the RNG
  Distribution dist,                      ///< distribution descriptor
  cudaStream_t stream = nullptr,
  int exclude_zero = -1                   ///< If non-negative, excludes 0.
                                          /// Note that setting this flag will result in more 1's,
                                          /// as we use a simple mechanism to replace 0's by adding/subtracting 1's.
  ) {

  // Distribution parameters are stored as doubles; convert to the element's real type.
  using Real = typename RealType<Element>::Type;

  if (dist.kind == Distribution::Gaussian) {
    TensorFillRandomGaussian<Element, Layout>(
      view,
      seed,
      static_cast<Real>(dist.gaussian.mean),
      static_cast<Real>(dist.gaussian.stddev),
      dist.int_scale,
      exclude_zero,
      stream);
  } else if (dist.kind == Distribution::Uniform) {
    TensorFillRandomUniform<Element, Layout>(
      view,
      seed,
      static_cast<Real>(dist.uniform.max),
      static_cast<Real>(dist.uniform.min),
      dist.int_scale,
      dist.uniform.pnan,
      exclude_zero,
      stream);
  }
}
1793
+
1794
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1795
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1796
+
1797
+ /// Fills a block of data with sequential elements
1798
+ template <
1799
+ typename Element
1800
+ >
1801
+ void BlockFillSequential(
1802
+ Element *ptr,
1803
+ int64_t capacity,
1804
+ Element v = Element(1),
1805
+ Element s = Element(0)) {
1806
+
1807
+ using Layout = layout::PackedVectorLayout;
1808
+ Layout::TensorCoord size(static_cast<Layout::Index>(capacity)); // -Wconversion
1809
+ Layout layout = Layout::packed(size);
1810
+ TensorView<Element, Layout> view(ptr, layout, size);
1811
+
1812
+ Array<Element, Layout::kRank> c{};
1813
+ c[0] = v;
1814
+
1815
+ TensorFillLinear(view, c, s);
1816
+ }
1817
+
1818
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1819
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1820
+
1821
/// Fills a block of data with random values drawn from the given distribution.
/// (Header previously said "sequential elements" — copy-paste from BlockFillSequential.)
///
/// Dispatches to BlockFillRandomGaussian or BlockFillRandomUniform based on
/// dist.kind; other Distribution kinds leave the block unmodified.
template <
  typename Element
>
void BlockFillRandom(
  Element *ptr,                   ///< pointer to the destination block
  size_t capacity,                ///< number of elements to fill
  uint64_t seed,                  ///< seed for the RNG
  Distribution dist,              ///< distribution descriptor
  cudaStream_t stream = nullptr) {

  // Distribution parameters are stored as doubles; convert to the element's real type.
  using Real = typename RealType<Element>::Type;

  if (dist.kind == Distribution::Gaussian) {
    BlockFillRandomGaussian<Element>(
      ptr,
      capacity,
      seed,
      static_cast<Real>(dist.gaussian.mean),
      static_cast<Real>(dist.gaussian.stddev),
      dist.int_scale,
      stream);
  }
  else if (dist.kind == Distribution::Uniform) {
    BlockFillRandomUniform<Element>(
      ptr,
      capacity,
      seed,
      static_cast<Real>(dist.uniform.max),
      static_cast<Real>(dist.uniform.min),
      dist.int_scale,
      dist.uniform.pnan,
      stream);
  }
}
1856
+
1857
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1858
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1859
+
1860
+ namespace detail {
1861
+
1862
+ /// Computes a random Gaussian distribution
1863
+ template <
1864
+ typename Element, ///< Element type
1865
+ typename Layout> ///< Layout function
1866
+ struct TensorCopyDiagonalInFunc {
1867
+
1868
+ /// View type
1869
+ using TensorView = TensorView<Element, Layout>;
1870
+
1871
+ /// Scalar type
1872
+ typedef typename TensorView::Element T;
1873
+
1874
+ /// Coordinate in tensor's index space
1875
+ typedef typename TensorView::TensorCoord TensorCoord;
1876
+
1877
+ /// Parameters structure
1878
+ struct Params {
1879
+
1880
+ //
1881
+ // Data members
1882
+ //
1883
+
1884
+ TensorView view;
1885
+ Element const *ptr;
1886
+
1887
+ /// Default ctor
1888
+ CUTLASS_HOST_DEVICE
1889
+ Params() { }
1890
+
1891
+ //
1892
+ // Methods
1893
+ //
1894
+
1895
+ /// Construction of Gaussian RNG functor.
1896
+ Params(
1897
+ TensorView view_, ///< destination tensor
1898
+ Element const *ptr_
1899
+ ):
1900
+ view(view_), ptr(ptr_) {
1901
+
1902
+ }
1903
+ };
1904
+
1905
+ //
1906
+ // Data members
1907
+ //
1908
+
1909
+ /// Parameters object
1910
+ Params params;
1911
+
1912
+ //
1913
+ // Methods
1914
+ //
1915
+
1916
+ /// Device-side initialization of RNG
1917
+ CUTLASS_DEVICE
1918
+ TensorCopyDiagonalInFunc(Params const &params): params(params) {
1919
+
1920
+ }
1921
+
1922
+ /// Only update the diagonal element
1923
+ CUTLASS_DEVICE
1924
+ void operator()(TensorCoord const &coord) {
1925
+ bool is_diagonal = true;
1926
+
1927
+ CUTLASS_PRAGMA_UNROLL
1928
+ for (int i = 1; i < Layout::kRank; ++i) {
1929
+ if (coord[i] != coord[0]) {
1930
+ is_diagonal = false;
1931
+ }
1932
+ }
1933
+ if (is_diagonal) {
1934
+ params.view.at(coord) = params.ptr[coord[0]];
1935
+ }
1936
+ }
1937
+ };
1938
+
1939
+ } // namespace detail
1940
+
1941
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1942
+
1943
+ /// Copies a diagonal in from host memory without modifying off-diagonal elements.
1944
+ template <
1945
+ typename Element, ///< Element type
1946
+ typename Layout> ///< Layout function
1947
+ void TensorCopyDiagonalIn(
1948
+ TensorView<Element, Layout> view, ///< destination tensor
1949
+ Element const *ptr, ///< dense buffer of elements
1950
+ cudaStream_t stream = nullptr) {
1951
+
1952
+ using Func = detail::TensorCopyDiagonalInFunc<Element, Layout>;
1953
+ using Params = typename Func::Params;
1954
+
1955
+ TensorForEach<Func, Layout::kRank, Params>(
1956
+ view.extent(),
1957
+ Params(view, ptr),
1958
+ /*grid_size*/0, /*block_size*/0,
1959
+ stream
1960
+ );
1961
+ }
1962
+
1963
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1964
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1965
+
1966
+
1967
namespace detail {

/// Functor that copies a tensor's generalized diagonal into a dense linear buffer.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorCopyDiagonalOutFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Scalar type
  typedef typename TensorView::Element T;

  /// Coordinate in tensor's index space
  typedef typename TensorView::TensorCoord TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;              ///< source tensor view
    Element *ptr;                 ///< destination buffer, indexed by coord[0]

    /// Default ctor
    CUTLASS_HOST_DEVICE
    Params() { }

    //
    // Methods
    //

    /// Constructs parameters for the copy-diagonal-out functor.
    Params(
      TensorView view_,           ///< source tensor
      Element *ptr_               ///< dense output buffer
    ):
      view(view_), ptr(ptr_) {

    }
  };

  //
  // Data members
  //

  /// Parameters object
  Params params;

  //
  // Methods
  //

  /// Device-side construction from parameters
  CUTLASS_DEVICE
  TensorCopyDiagonalOutFunc(Params const &params): params(params) {

  }

  /// Copies the element into the output buffer when the coordinate is diagonal.
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {
    bool is_diagonal = true;

    // A coordinate is on the generalized diagonal when all components equal the first.
    // (Loop does not break early; result is identical either way.)
    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < Layout::kRank; ++i) {
      if (coord[i] != coord[0]) {
        is_diagonal = false;
      }
    }
    if (is_diagonal) {
      params.ptr[coord[0]] = params.view.at(coord);
    }
  }
};

} // namespace detail
2047
+
2048
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
2049
+
2050
+ /// Copies the diagonal of a tensor into a dense buffer in host memory.
2051
+ template <
2052
+ typename Element, ///< Element type
2053
+ typename Layout> ///< Layout function
2054
+ void TensorCopyDiagonalOut(
2055
+ Element *ptr, ///< dense buffer of elements
2056
+ TensorView<Element, Layout> view, ///< source tensor
2057
+ cudaStream_t stream = nullptr) {
2058
+
2059
+ using Func = detail::TensorCopyDiagonalOutFunc<Element, Layout>;
2060
+ using Params = typename Func::Params;
2061
+
2062
+ TensorForEach<Func, Layout::kRank, Params>(
2063
+ view.extent(),
2064
+ Params(view, ptr),
2065
+ /*grid_size*/0, /*block_size*/0,
2066
+ stream
2067
+ );
2068
+ }
2069
+
2070
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
2071
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
2072
+
2073
+ } // namespace device
2074
+ } // namespace reference
2075
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_foreach.h ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include <stdexcept>
34
+ #include "cutlass/cutlass.h"
35
+ #include "cutlass/util/reference/device/kernel/tensor_foreach.h"
36
+
37
+ namespace cutlass {
38
+ namespace reference {
39
+ namespace device {
40
+
41
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
42
+
43
/// Launches a kernel calling a functor for each element in a tensor's index space.
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  ///
  /// @param size       extent of the tensor's index space
  /// @param params     functor parameters forwarded to the device kernel
  /// @param grid_size  CUDA grid size; 0 requests an occupancy-based choice
  /// @param block_size CUDA block size; 0 requests an occupancy-based choice
  /// @param stream     CUDA stream on which the kernel is launched
  ///
  /// @throws std::runtime_error if the occupancy query fails
  TensorForEach(
    Coord<Rank> size, Params params = Params(),
    int grid_size = 0, int block_size = 0,
    cudaStream_t stream = nullptr) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }
      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::TensorForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(size, params);
  }
};
75
+
76
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
77
+
78
/// Launches a kernel calling a functor for each element along a tensor's diagonal
template <typename Func, int Rank, typename Params>
struct TensorDiagonalForEach {

  /// Constructor performs the operation
  ///
  /// @param size       extent of the tensor's index space
  /// @param params     functor parameters forwarded to the device kernel
  /// @param start      first diagonal index to process (inclusive)
  /// @param end        one past the last diagonal index; negative means the full
  ///                   diagonal, i.e. the minimum extent across all ranks
  /// @param block_size CUDA block size (grid size is derived from the range)
  /// @param stream     CUDA stream on which the kernel is launched
  TensorDiagonalForEach(
    Coord<Rank> size, Params params = Params(),
    int start = 0, int end = -1,
    int block_size = 128, cudaStream_t stream = nullptr) {

    // Default to the full diagonal: its length is the smallest extent.
    if (end < 0) {
      end = size.min();
    }

    // One thread per diagonal element, rounded up to whole blocks.
    dim3 block(block_size, 1, 1);
    dim3 grid((end - start + block_size - 1) / block_size, 1, 1);

    kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(
      size, params, start, end);
  }
};
99
+
100
+
101
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
102
+
103
/// Launches a kernel applying a functor to each element of a linear block of memory.
template <typename Element, typename Func>
struct BlockForEach {

  /// Constructor performs the operation.
  ///
  /// @param ptr        pointer to the first element of the block
  /// @param capacity   number of elements in the block
  /// @param params     functor parameters forwarded to the device kernel
  /// @param grid_size  CUDA grid size; 0 requests an occupancy-based choice
  /// @param block_size CUDA block size; 0 requests an occupancy-based choice
  /// @param stream     CUDA stream on which the kernel is launched
  ///
  /// @throws std::runtime_error if the occupancy query fails
  BlockForEach(
    Element *ptr,
    size_t capacity,
    typename Func::Params params = typename Func::Params(),
    int grid_size = 0,
    int block_size = 0,
    cudaStream_t stream = nullptr) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }
      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::BlockForEach<Element, Func><<< grid, block, 0, stream >>>(ptr, capacity, params);
  }
};
137
+
138
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
139
+
140
+ } // namespace device
141
+ } // namespace reference
142
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_reduce.h ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include <cmath>
34
+
35
+ #include "cutlass/cutlass.h"
36
+ #include "cutlass/complex.h"
37
+ #include "cutlass/functional.h"
38
+ #include "cutlass/numeric_conversion.h"
39
+ #include "cutlass/tensor_view.h"
40
+ #include "cutlass/util/device_memory.h"
41
+ #include "cutlass/util/reference/detail/linear_to_coordinate.h"
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace reference {
47
+ namespace device {
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ namespace kernel {
52
+
53
template <
  typename Element,
  typename Layout,
  typename ComputeType,
  typename ReduceOp,
  typename TransformOp,
  int kBlockSize = 128
>
__global__ void TensorTransformReducePartial(
  TensorView<Element, Layout> view,        /// View of the tensor to reduce over
  ComputeType identity,                    /// Identity element of the reduction operation
  ReduceOp reduce,                         /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
  TransformOp transform,                   /// Transforms the tensor element to ComputeType: g(Element) => ComputeType
  ComputeType *workspace) {                /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0]

  int64_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  int64_t size = view.size();

  // One slot per thread of this block for the intra-block combine below.
  __shared__ ComputeType scratchpad[kBlockSize];

  // Grid-stride loop: each thread folds a strided subset of elements into its
  // private accumulator (reusing `identity` as that accumulator).
  for (; idx < size; idx += blockDim.x * gridDim.x) {

    // Map linear thread ID onto tensor coordinate
    typename Layout::TensorCoord coord;

    cutlass::reference::detail::LinearToCoordinate<Layout::kRank>()(coord, idx, view.extent());

    if (view.contains(coord)) {

      // Fetch element
      Element x = view.at(coord);

      // Transform
      identity = reduce(identity, transform(x));
    }
  }

  // Publish the per-thread partial; threads that never entered the loop store
  // the unmodified identity, which is harmless for the reduction.
  scratchpad[threadIdx.x] = identity;

  __syncthreads();

  // One thread performs the final reduction and stores out. This could be enhanced via
  // a tree reduction and pipelining.
  if (threadIdx.x == 0) {

    for (int i = 1; i < kBlockSize; ++i) {
      identity = reduce(identity, scratchpad[i]);
    }

    // One partial result per thread block; a second kernel folds these together.
    workspace[blockIdx.x] = identity;
  }
}
105
+
106
template <
  typename Element,
  typename Layout,
  typename ComputeType,
  typename ReduceOp,
  typename TransformOp,
  int kBlockSize = 128
>
__global__ void TensorTransformReducePartial(
  TensorView<Element, Layout> view_A,      /// View of the tensor to reduce over
  TensorView<Element, Layout> view_B,      /// View of the tensor to reduce over
  ComputeType identity,                    /// Identity element of the reduction operation
  ReduceOp reduce,                         /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
  TransformOp transform,                   /// Transforms the tensor element to ComputeType: g(Element) => ComputeType
  ComputeType *workspace) {                /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0]

  int64_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  // Iteration bound is taken from view_A; the host wrapper verifies the extents match.
  auto size = static_cast<int64_t>(view_A.size());

  // One slot per thread of this block for the intra-block combine below.
  __shared__ ComputeType scratchpad[kBlockSize];

  // Grid-stride loop: each thread folds a strided subset of zipped element
  // pairs into its private accumulator (reusing `identity` as that accumulator).
  for (; idx < size; idx += blockDim.x * gridDim.x) {

    // Map linear thread ID onto tensor coordinate
    typename Layout::TensorCoord coord;

    cutlass::reference::detail::LinearToCoordinate<Layout::kRank>()(coord, idx, view_A.extent());

    if (view_A.contains(coord)) {

      // Fetch element
      Element a = view_A.at(coord);
      Element b = view_B.at(coord);

      // Transform
      identity = reduce(identity, transform(a, b));
    }
  }

  // Publish the per-thread partial; threads that never entered the loop store
  // the unmodified identity, which is harmless for the reduction.
  scratchpad[threadIdx.x] = identity;

  __syncthreads();

  // One thread performs the final reduction and stores out. This could be enhanced via
  // a tree reduction and pipelining.
  if (threadIdx.x == 0) {

    for (int i = 1; i < kBlockSize; ++i) {
      identity = reduce(identity, scratchpad[i]);
    }

    // One partial result per thread block; a second kernel folds these together.
    workspace[blockIdx.x] = identity;
  }
}
160
+
161
+
162
template <
  typename ComputeType,
  typename ReduceOp,
  int kBlockSize = 32
>
__global__ void TensorTransformReduceFinalize(
  ComputeType *workspace,      // per-block partials from the first pass; final result overwrites workspace[0]
  ComputeType identity,        // identity element of the reduction
  int workspace_size,          // number of valid partial results in workspace
  ReduceOp reduce) {           // binary reduction functor

  // One slot per thread; this kernel is launched with a single block of kBlockSize threads.
  __shared__ ComputeType scratchpad[kBlockSize];

  // Each thread folds a strided subset of the partial results.
  for (int idx = threadIdx.x; idx < workspace_size; idx += kBlockSize) {
    identity = reduce(identity, workspace[idx]);
  }

  // Threads beyond workspace_size store the unmodified identity, which is harmless.
  scratchpad[threadIdx.x] = identity;

  __syncthreads();

  // Thread 0 serially combines the per-thread partials and stores the final value.
  if (threadIdx.x == 0) {

    for (int i = 1; i < kBlockSize; ++i) {
      identity = reduce(identity, scratchpad[i]);
    }

    workspace[0] = identity;
  }
}
192
+
193
+ } // namespace kernel
194
+
195
+ /////////////////////////////////////////////////////////////////////////////////////////////////
196
+
197
+ /// Transform-reduce operation over the elements of a tensor
198
+ template <
199
+ typename Element,
200
+ typename Layout,
201
+ typename ComputeType,
202
+ typename ReduceOp,
203
+ typename TransformOp
204
+ >
205
+ ComputeType TensorTransformReduce(
206
+ TensorView<Element, Layout> view, /// View of the tensor to reduce over
207
+ ComputeType identity, /// Identity element of the reduction operation
208
+ ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
209
+ TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType
210
+ ComputeType *workspace, /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0]
211
+ int workspace_size, /// Number of elements in workspace
212
+ cudaStream_t stream = nullptr, /// CUDA stream to launch into
213
+ bool copy_out = true /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned.
214
+ ) {
215
+
216
+ int const kBlockSize = 128;
217
+
218
+ dim3 block(kBlockSize, 1);
219
+ dim3 grid(workspace_size, 1);
220
+
221
+ kernel::TensorTransformReducePartial<
222
+ Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize
223
+ ><<< grid, block, 0, stream >>>(
224
+ view, identity, reduce, transform, workspace
225
+ );
226
+
227
+ int const kFinalizeBlockSize = 32;
228
+
229
+ kernel::TensorTransformReduceFinalize<
230
+ ComputeType, ReduceOp, kFinalizeBlockSize
231
+ ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>(
232
+ workspace, identity, workspace_size, reduce
233
+ );
234
+
235
+ cudaStreamSynchronize(stream);
236
+
237
+ if (copy_out) {
238
+ cudaError_t result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost);
239
+ if (result != cudaSuccess) {
240
+ throw std::runtime_error("cudaMemcpy() failed");
241
+ }
242
+ }
243
+
244
+ return identity;
245
+ }
246
+
247
+ /// Transform-reduce operation over the elements of two tensors, zipped together
248
+ template <
249
+ typename Element,
250
+ typename Layout,
251
+ typename ComputeType,
252
+ typename ReduceOp,
253
+ typename TransformOp
254
+ >
255
+ ComputeType TensorTransformReduce(
256
+ TensorView<Element, Layout> view_A, /// View of the tensor to reduce over
257
+ TensorView<Element, Layout> view_B, /// View of the tensor to reduce over
258
+ ComputeType identity, /// Identity element of the reduction operation
259
+ ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
260
+ TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType
261
+ ComputeType *workspace, /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0]
262
+ int workspace_size, /// Number of elements in workspace
263
+ cudaStream_t stream = nullptr, /// CUDA stream to launch into
264
+ bool copy_out = true /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned.
265
+ ) {
266
+
267
+ if (view_A.extent() != view_B.extent()) {
268
+ throw std::runtime_error("Extents must be equal.");
269
+ }
270
+
271
+ int const kBlockSize = 128;
272
+
273
+ dim3 block(kBlockSize, 1);
274
+ dim3 grid(workspace_size, 1);
275
+
276
+ kernel::TensorTransformReducePartial<
277
+ Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize
278
+ ><<< grid, block, 0, stream >>>(
279
+ view_A, view_B, identity, reduce, transform, workspace
280
+ );
281
+
282
+ int const kFinalizeBlockSize = 32;
283
+
284
+ kernel::TensorTransformReduceFinalize<
285
+ ComputeType, ReduceOp, kFinalizeBlockSize
286
+ ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>(
287
+ workspace, identity, workspace_size, reduce
288
+ );
289
+
290
+ cudaStreamSynchronize(stream);
291
+
292
+ if (copy_out) {
293
+ cudaError_t result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost);
294
+ if (result != cudaSuccess) {
295
+ throw std::runtime_error("cudaMemcpy() failed");
296
+ }
297
+ }
298
+
299
+ return identity;
300
+ }
301
+
302
+ /// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
303
+ /// workspace
304
+ template <
305
+ typename Element,
306
+ typename Layout,
307
+ typename ComputeType,
308
+ typename ReduceOp,
309
+ typename TransformOp
310
+ >
311
+ ComputeType TensorTransformReduce(
312
+ TensorView<Element, Layout> view,
313
+ ComputeType identity,
314
+ ReduceOp reduce,
315
+ TransformOp transform,
316
+ cudaStream_t stream = nullptr,
317
+ int workspace_size = 0
318
+ ) {
319
+
320
+ // Optionally query for the SM count to size the workspace.
321
+ if (!workspace_size) {
322
+
323
+ int device_idx = 0;
324
+ cudaDeviceProp prop;
325
+
326
+ cudaError_t result = cudaGetDevice(&device_idx);
327
+ if (result != cudaSuccess) {
328
+ throw std::runtime_error("cudaGetDevice() failed");
329
+ }
330
+
331
+ result = cudaGetDeviceProperties(&prop, device_idx);
332
+ if (result != cudaSuccess) {
333
+ throw std::runtime_error("cudaGetDeviceProp() failed");
334
+ }
335
+
336
+ workspace_size = int(prop.multiProcessorCount);
337
+ }
338
+
339
+ DeviceAllocation<ComputeType> workspace(workspace_size);
340
+
341
+ ComputeType output = TensorTransformReduce(
342
+ view,
343
+ identity,
344
+ reduce,
345
+ transform,
346
+ workspace.get(),
347
+ workspace_size,
348
+ stream,
349
+ true);
350
+
351
+ return output;
352
+ }
353
+
354
+
355
+ /// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
356
+ /// workspace
357
+ template <
358
+ typename Element,
359
+ typename Layout,
360
+ typename ComputeType,
361
+ typename ReduceOp,
362
+ typename TransformOp
363
+ >
364
+ ComputeType TensorTransformReduce(
365
+ TensorView<Element, Layout> view_A,
366
+ TensorView<Element, Layout> view_B,
367
+ ComputeType identity,
368
+ ReduceOp reduce,
369
+ TransformOp transform,
370
+ cudaStream_t stream = nullptr,
371
+ int workspace_size = 0
372
+ ) {
373
+
374
+ // Optionally query for the SM count to size the workspace.
375
+ if (!workspace_size) {
376
+
377
+ int device_idx = 0;
378
+ cudaDeviceProp prop;
379
+
380
+ cudaError_t result = cudaGetDevice(&device_idx);
381
+ if (result != cudaSuccess) {
382
+ throw std::runtime_error("cudaGetDevice() failed");
383
+ }
384
+
385
+ result = cudaGetDeviceProperties(&prop, device_idx);
386
+ if (result != cudaSuccess) {
387
+ throw std::runtime_error("cudaGetDeviceProp() failed");
388
+ }
389
+
390
+ workspace_size = int(prop.multiProcessorCount);
391
+ }
392
+
393
+ DeviceAllocation<ComputeType> workspace(workspace_size);
394
+
395
+ ComputeType output = TensorTransformReduce(
396
+ view_A,
397
+ view_B,
398
+ identity,
399
+ reduce,
400
+ transform,
401
+ workspace.get(),
402
+ workspace_size,
403
+ stream,
404
+ true);
405
+
406
+ return output;
407
+ }
408
+
409
+ /////////////////////////////////////////////////////////////////////////////////////////////////
410
+
411
+ /// Helper to compute the sum of the elements of a tensor
412
+ template <
413
+ typename Element,
414
+ typename Layout,
415
+ typename ComputeType = Element
416
+ >
417
+ ComputeType TensorSum(
418
+ TensorView<Element, Layout> view,
419
+ ComputeType identity = ComputeType(),
420
+ cudaStream_t stream = nullptr,
421
+ int workspace_size = 0
422
+ ) {
423
+
424
+ plus<ComputeType> reduce;
425
+ NumericConverter<ComputeType, Element> transform;
426
+
427
+ return TensorTransformReduce(
428
+ view, identity, reduce, transform, stream, workspace_size);
429
+ }
430
+
431
+ /// Helper to compute the sum of the squares of the elements of a tensor
432
+ template <
433
+ typename Element,
434
+ typename Layout,
435
+ typename ComputeType = Element
436
+ >
437
+ ComputeType TensorSumSq(
438
+ TensorView<Element, Layout> view,
439
+ ComputeType identity = ComputeType(),
440
+ cudaStream_t stream = nullptr,
441
+ int workspace_size = 0
442
+ ) {
443
+
444
+ plus<ComputeType> reduce;
445
+ magnitude_squared<Element, ComputeType> transform;
446
+
447
+ return TensorTransformReduce(
448
+ view, identity, reduce, transform, stream, workspace_size);
449
+ }
450
+
451
+ /// Helper to compute the norm of the elements of a tensor.
452
+ template <
453
+ typename Element,
454
+ typename Layout,
455
+ typename ComputeType = double
456
+ >
457
+ ComputeType TensorNorm(
458
+ TensorView<Element, Layout> view,
459
+ ComputeType identity = ComputeType(),
460
+ cudaStream_t stream = nullptr,
461
+ int workspace_size = 0
462
+ ) {
463
+
464
+ return std::sqrt(TensorSumSq(view, identity, stream, workspace_size));
465
+ }
466
+
467
+ /////////////////////////////////////////////////////////////////////////////////////////////////
468
+
469
+ /// Helper to compute the sum of the squares of the differences of two tensors
470
+ template <
471
+ typename Element,
472
+ typename Layout,
473
+ typename ComputeType = double
474
+ >
475
+ ComputeType TensorSumSqDiff(
476
+ TensorView<Element, Layout> view_A,
477
+ TensorView<Element, Layout> view_B,
478
+ ComputeType identity = ComputeType(),
479
+ cudaStream_t stream = nullptr,
480
+ int workspace_size = 0
481
+ ) {
482
+
483
+ plus<ComputeType> reduce;
484
+ magnitude_squared_difference<Element, ComputeType> transform;
485
+
486
+ return TensorTransformReduce(
487
+ view_A, view_B, identity, reduce, transform, stream, workspace_size);
488
+ }
489
+
490
+
491
+ /// Helper to compute the norm of the tensor computed as the difference of two tensors in memory
492
+ template <
493
+ typename Element,
494
+ typename Layout,
495
+ typename ComputeType = double
496
+ >
497
+ ComputeType TensorNormDiff(
498
+ TensorView<Element, Layout> view_A,
499
+ TensorView<Element, Layout> view_B,
500
+ ComputeType identity = ComputeType(),
501
+ cudaStream_t stream = nullptr,
502
+ int workspace_size = 0
503
+ ) {
504
+
505
+ return std::sqrt(TensorSumSqDiff(view_A, view_B, identity, stream, workspace_size));
506
+ }
507
+
508
+ /////////////////////////////////////////////////////////////////////////////////////////////////
509
+
510
+ } // namespace device
511
+ } // namespace reference
512
+ } // namespace cutlass
513
+
514
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_relu.h ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
33
+ in this header are not specialized for any particular data layout and are therefore not
34
+ intended to offer the best possible performance. Rather, they are intended to be generic
35
+ reference implementations to support the CUTLASS unit tests.
36
+ */
37
+
38
+ #pragma once
39
+
40
+ // Cutlass includes
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/tensor_view.h"
43
+
44
+ #include "cutlass/util/reference/device/tensor_foreach.h"
45
+
46
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
47
+
48
+ namespace cutlass {
49
+ namespace reference {
50
+ namespace device {
51
+
52
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
53
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ namespace detail {
56
+
57
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
struct TensorReLuFunc {

  /// View type
  using TensorView = TensorView<Element, Layout>;

  /// Coordinate in tensor's index space
  using TensorCoord = typename TensorView::TensorCoord;

  /// Parameters structure
  struct Params {

    //
    // Data members
    //

    TensorView view;      // tensor updated in place
    Element threshold;    // values below this are clamped up to it


    //
    // Methods
    //

    Params(
      TensorView view_ = TensorView(),
      Element threshold_ = Element(0)
    ):
      view(view_), threshold(threshold_) {

    }
  };

  //
  // Data members
  //

  Params params;

  //
  // Methods
  //

  CUTLASS_DEVICE
  TensorReLuFunc(Params const &params): params(params) {

  }

  /// Applies the thresholded ReLu to the element at `coord` in place:
  /// value := max(value, threshold).
  CUTLASS_DEVICE
  void operator()(TensorCoord const &coord) {

    Element const & value = params.view.at(coord);
    params.view.at(coord) = (value < params.threshold) ? params.threshold : value;
  }
};
114
+
115
+ } // namespace detail
116
+
117
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
118
+
119
+ /// Apply ReLu on a tensor
120
+ template <
121
+ typename Element, ///< Element type
122
+ typename Layout> ///< Layout function
123
+ void TensorReLu(
124
+ TensorView<Element, Layout> view, ///< destination tensor
125
+ Element threshold = Element(0)) { ///< ReLu threshold
126
+
127
+ using Func = detail::TensorReLuFunc<Element, Layout>;
128
+ using Params = typename Func::Params;
129
+
130
+ TensorForEach<Func, Layout::kRank, Params>(
131
+ view.extent(),
132
+ Params(view, threshold)
133
+ );
134
+ }
135
+
136
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
137
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
138
+
139
+ } // namespace device
140
+ } // namespace reference
141
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/thread/gemm.h ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GEMM in host-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/tensor_view.h"
39
+ #include "cutlass/gemm/gemm.h"
40
+
41
+ namespace cutlass {
42
+ namespace reference {
43
+ namespace device {
44
+ namespace thread {
45
+
46
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
47
+
48
+ /// Thread-level blocked general matrix product.
49
+ //
50
+ // Note, this is a reference implementation. Performance is not expected to approach peak.
51
+ //
52
template <
  typename TensorRefA,
  typename TensorRefB,
  typename TensorRefC,
  typename ScalarType,
  typename AccumulatorType,
  typename OutputTile,
  typename InnerProductOp = multiply_add<AccumulatorType>,
  typename ConvertOp = NumericConverter<typename TensorRefC::Element, ScalarType>
>
struct Gemm {

  using ElementA = typename TensorRefA::Element;
  using ElementB = typename TensorRefB::Element;
  using ElementC = typename TensorRefC::Element;

  //
  // Data members
  //

  /// Tile for A operand
  ElementA A_tile[OutputTile::kColumn];

  /// Tile for B operand
  ElementB B_tile[OutputTile::kRow];

  /// Tile for Accumulator
  // NOTE(review): accum is declared [kColumn][kRow], but multiply_add/epilogue
  // below index accum[j][i] with j < kRow and i < kColumn, while the constructor
  // clears it with j < kColumn and i < kRow. These agree only when OutputTile is
  // square — verify intended orientation before using non-square tiles.
  AccumulatorType accum[OutputTile::kColumn][OutputTile::kRow];

  //
  // Methods
  //

  /// Constructor
  CUTLASS_HOST_DEVICE
  Gemm(AccumulatorType initial_accum = AccumulatorType(0)) {

    // Clear fetch registers
    for (int i = 0; i < OutputTile::kColumn; ++i) {
      A_tile[i] = ElementA(0);
    }

    for (int j = 0; j < OutputTile::kRow; ++j) {
      B_tile[j] = ElementB(0);
    }

    // Clear accumulators
    CUTLASS_PRAGMA_UNROLL
    for (int j = 0; j < OutputTile::kColumn; ++j) {
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < OutputTile::kRow; ++i) {
        accum[j][i] = initial_accum;
      }
    }
  }

  /// Computes a matrix product, accumulating an OutputTile::kColumn-by-kRow
  /// result anchored at `output_coord`; out-of-range rows/columns reuse the
  /// previously fetched (initially zero) tile values.
  CUTLASS_HOST_DEVICE
  Gemm & multiply_add(
    gemm::GemmCoord problem_size,
    TensorRefA tensor_a,
    TensorRefB tensor_b,
    MatrixCoord output_coord = MatrixCoord()) {

    InnerProductOp inner_product_op;

    // Loop over the GEMM K dimension
    CUTLASS_PRAGMA_NO_UNROLL
    for (int k = 0; k < problem_size.k(); ++k) {

      // Fetch a slice of the A matrix
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < OutputTile::kColumn; ++i) {
        if (output_coord.row() + i < problem_size.m()) {
          A_tile[i] = tensor_a.at(make_Coord(output_coord.row() + i, k));
        }
      }

      // Fetch a slice of the B matrix
      CUTLASS_PRAGMA_UNROLL
      for (int j = 0; j < OutputTile::kRow; ++j) {
        if (output_coord.column() + j < problem_size.n()) {
          B_tile[j] = tensor_b.at(make_Coord(k, output_coord.column() + j));
        }
      }

      // Compute an accumulated matrix product
      CUTLASS_PRAGMA_UNROLL
      for (int j = 0; j < OutputTile::kRow; ++j) {
        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < OutputTile::kColumn; ++i) {
          accum[j][i] = inner_product_op(A_tile[i], B_tile[j], accum[j][i]);
        }
      }
    }

    return *this;
  }

  /// Performs linear scaling of matrix product and updates output tensor:
  /// D = convert(alpha * accum + beta * C), guarded against writes outside
  /// the problem extent.
  CUTLASS_HOST_DEVICE
  Gemm & epilogue(
    gemm::GemmCoord problem_size,
    ScalarType alpha,
    ScalarType beta,
    TensorRefC tensor_c,
    TensorRefC tensor_d,
    MatrixCoord output_coord = MatrixCoord()) {

    ConvertOp convert_op;

    // Update the output tensor
    for (int j = 0; j < OutputTile::kRow; ++j) {
      for (int i = 0; i < OutputTile::kColumn; ++i) {
        // i offsets the row and j the column, matching the fetch pattern in multiply_add.
        MatrixCoord coord = output_coord + MatrixCoord(i, j);
        if (coord.row() < problem_size.m() && coord.column() < problem_size.n()) {

          tensor_d.at(coord) = convert_op(
            alpha * ScalarType(accum[j][i]) +
            beta * ScalarType(tensor_c.at(coord))
          );
        }
      }
    }

    return *this;
  }
};
180
+
181
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
182
+
183
+ } // namespace thread
184
+ } // namespace device
185
+ } // namespace reference
186
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/conv.hpp ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for CONV in host-side code.
33
+ */
34
+ #pragma once
35
+
36
+ /////////////////////////////////////////////////////////////////////////////////////////////////
37
+
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/numeric_conversion.h"
40
+ #include "cutlass/epilogue/thread/activation.h"
41
+
42
+ #include "cute/tensor.hpp"
43
+
44
+ #include <cuda_runtime.h>
45
+
46
+ /////////////////////////////////////////////////////////////////////////////////////////////////
47
+
48
+ namespace cutlass::reference::host {
49
+
50
+ /////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ namespace detail {
53
+
54
+ template<class EngineAct, class LayoutAct>
55
+ bool
56
+ is_activation_in_bounds(
57
+ cute::Tensor<EngineAct, LayoutAct> const& activation,
58
+ int32_t n_, int32_t d_, int32_t h_, int32_t w_, int32_t c_, int32_t g_) {
59
+ return ((g_ >= 0 && g_ < size<5>(activation)) &&
60
+ (n_ >= 0 && n_ < size<4>(activation)) &&
61
+ (d_ >= 0 && d_ < size<3>(activation)) &&
62
+ (h_ >= 0 && h_ < size<2>(activation)) &&
63
+ (w_ >= 0 && w_ < size<1>(activation)) &&
64
+ (c_ >= 0 && c_ < size<0>(activation)));
65
+ }
66
+
67
+ template<class EngineAct, class LayoutAct>
68
+ bool
69
+ is_activation_in_bounds(
70
+ cute::Tensor<EngineAct, LayoutAct> const& activation,
71
+ int32_t n_, int32_t h_, int32_t w_, int32_t c_, int32_t g_) {
72
+ return ((g_ >= 0 && g_ < size<4>(activation)) &&
73
+ (n_ >= 0 && n_ < size<3>(activation)) &&
74
+ (h_ >= 0 && h_ < size<2>(activation)) &&
75
+ (w_ >= 0 && w_ < size<1>(activation)) &&
76
+ (c_ >= 0 && c_ < size<0>(activation)));
77
+ }
78
+
79
+ template<class EngineAct, class LayoutAct>
80
+ bool
81
+ is_activation_in_bounds(
82
+ cute::Tensor<EngineAct, LayoutAct> const& activation,
83
+ int32_t n_, int32_t w_, int32_t c_, int32_t g_) {
84
+ return ((g_ >= 0 && g_ < size<3>(activation)) &&
85
+ (n_ >= 0 && n_ < size<2>(activation)) &&
86
+ (w_ >= 0 && w_ < size<1>(activation)) &&
87
+ (c_ >= 0 && c_ < size<0>(activation)));
88
+ }
89
+
90
+ } // namespace detail
91
+
92
+ template<
93
+ class ElementAcc_,
94
+ class ElementScalar_,
95
+ class ElementCompute_,
96
+ class ElementC_,
97
+ class ElementOut_,
98
+ bool ResidualAdd_,
99
+ class TensorAlpha_,
100
+ class TensorBeta_,
101
+ class TensorBias_,
102
+ class ActivationFunctor_ = cutlass::epilogue::thread::Identity<ElementCompute_>
103
+ >
104
+ struct ConvEpilogueFusionParams {
105
+ using ElementAcc = ElementAcc_;
106
+ using ElementScalar = ElementScalar_;
107
+ using ElementCompute = ElementCompute_;
108
+ using ElementC = ElementC_;
109
+ using ElementOut = ElementOut_;
110
+ using TensorAlpha = TensorAlpha_;
111
+ using TensorBeta = TensorBeta_;
112
+ using TensorBias = TensorBias_;
113
+ using ActivationFunctor = ActivationFunctor_;
114
+ static constexpr bool ResidualAdd = ResidualAdd_; // Source added after activation
115
+
116
+ ElementScalar alpha = ElementScalar(1);
117
+ ElementScalar beta = ElementScalar(0);
118
+
119
+ TensorAlpha tensor_alpha{};
120
+ TensorBeta tensor_beta{};
121
+ TensorBias tensor_bias{};
122
+ };
123
+
124
+ template<
125
+ cutlass::conv::Operator ConvOp,
126
+ int NumSpatialDims,
127
+ class TensorA,
128
+ class TensorB,
129
+ class TensorC,
130
+ class TensorD,
131
+ class ShapePadding,
132
+ class StrideTraversal,
133
+ class ShapeDilation,
134
+ class EpilogueFusionParams
135
+ >
136
+ struct ConvReferenceImpl {
137
+ // Hard code accumlulator type to float to avoid data lost in accumulating add.
138
+ using ElementAcc = cutlass::platform::conditional_t<cutlass::platform::is_same_v<typename EpilogueFusionParams::ElementAcc, double>, double, float>;
139
+ using ElementC = typename EpilogueFusionParams::ElementC;
140
+ using ElementOut = typename EpilogueFusionParams::ElementOut;
141
+ using ElementScalar = typename EpilogueFusionParams::ElementScalar;
142
+ using ElementCompute = typename EpilogueFusionParams::ElementCompute;
143
+ using ElementBias = typename EpilogueFusionParams::TensorBias::value_type;
144
+ using ActivationFunctor = typename EpilogueFusionParams::ActivationFunctor;
145
+
146
+ // Input related converter
147
+ NumericConverter<ElementCompute, ElementAcc> acc_converter;
148
+ NumericConverter<ElementCompute, ElementC> residual_converter;
149
+ NumericConverter<ElementCompute, ElementBias> bias_converter;
150
+ // Scale related converter
151
+ NumericConverter<ElementCompute, ElementScalar> scale_converter;
152
+ // Output related converter
153
+ NumericConverter<ElementOut, ElementCompute> output_converter;
154
+
155
+ EpilogueFusionParams& epi_fusion_params_;
156
+ TensorA const& tensor_a_;
157
+ TensorB const& tensor_b_;
158
+ TensorC const& tensor_c_;
159
+ TensorD& tensor_d_;
160
+
161
+ ShapePadding const& padding_;
162
+ StrideTraversal const& tstride_;
163
+ ShapeDilation const& dilation_;
164
+
165
+ // Epilogue activation operation
166
+ ActivationFunctor epi_activation;
167
+
168
+ ConvReferenceImpl(
169
+ TensorA const& tensor_a,
170
+ TensorB const& tensor_b,
171
+ TensorC const& tensor_c,
172
+ TensorD& tensor_d,
173
+ ShapePadding const& padding,
174
+ StrideTraversal const& tstride,
175
+ ShapeDilation const& dilation,
176
+ EpilogueFusionParams& epi_fusion_params)
177
+ : tensor_a_(tensor_a),
178
+ tensor_b_(tensor_b),
179
+ tensor_c_(tensor_c),
180
+ tensor_d_(tensor_d),
181
+ padding_(padding),
182
+ tstride_(tstride),
183
+ dilation_(dilation),
184
+ epi_fusion_params_(epi_fusion_params)
185
+ {
186
+ static_assert(rank(ShapePadding{}) == rank(ShapeDilation{}));
187
+ static_assert(rank(ShapePadding{}) == rank(StrideTraversal{}));
188
+ }
189
+
190
+ void compute_reference() {
191
+ if constexpr (ConvOp == cutlass::conv::Operator::kFprop) {
192
+ fprop_reference(cute::Int<NumSpatialDims>{});
193
+ }
194
+ else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) {
195
+ dgrad_reference(cute::Int<NumSpatialDims>{});
196
+ }
197
+ else {
198
+ wgrad_reference(cute::Int<NumSpatialDims>{});
199
+ }
200
+ }
201
+
202
+ private:
203
+ // Specialization for 1D fprop kernel
204
+ void fprop_reference(cute::Int<1> spatial_dims) {
205
+ int32_t G = size<3>(tensor_d_);
206
+ int32_t N = size<2>(tensor_d_);
207
+ int32_t Q = size<1>(tensor_d_);
208
+ int32_t K = size<0>(tensor_d_);
209
+ int32_t S = size<1>(tensor_b_);
210
+ int32_t C = size<0>(tensor_b_);
211
+
212
+ #if defined(_OPENMP)
213
+ #pragma omp parallel for collapse(2)
214
+ #endif
215
+ for (int32_t g = 0; g < G; ++g) {
216
+ for (int32_t n = 0; n < N; ++n) {
217
+ for (int32_t q = 0; q < Q; ++q) {
218
+ for (int32_t k = 0; k < K; ++k) {
219
+ auto accumulator = ElementAcc(0);
220
+ for (int32_t s = 0; s < S; ++s) {
221
+ for (int32_t c = 0; c < C; ++c) {
222
+ int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
223
+ if (detail::is_activation_in_bounds(tensor_a_, n, w, c, g)) {
224
+ auto a = tensor_a_(c, w, n, g);
225
+ auto b = tensor_b_(c, s, k, g);
226
+ accumulator += ElementAcc(a * b);
227
+ }
228
+ }
229
+ }
230
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
231
+ epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
232
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
233
+ epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
234
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
235
+ if (not EpilogueFusionParams::ResidualAdd) {
236
+ output += scale_converter(beta) * residual_converter(tensor_c_(k, q, n, g));
237
+ }
238
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
239
+ output += bias_converter(epi_fusion_params_.tensor_bias[k]);
240
+ }
241
+ output = epi_activation(output);
242
+ if (EpilogueFusionParams::ResidualAdd) {
243
+ output += scale_converter(beta) * residual_converter(tensor_c_(k, q, n, g));
244
+ }
245
+ tensor_d_(k, q, n, g) = output_converter(output);
246
+ }
247
+ }
248
+ }
249
+ }
250
+
251
+ }
252
+
253
+ // Specialization for 2D fprop kernel
254
+ void fprop_reference(cute::Int<2> spatial_dims) {
255
+ int32_t G = size<4>(tensor_d_);
256
+ int32_t N = size<3>(tensor_d_);
257
+ int32_t P = size<2>(tensor_d_);
258
+ int32_t Q = size<1>(tensor_d_);
259
+ int32_t K = size<0>(tensor_d_);
260
+ int32_t R = size<2>(tensor_b_);
261
+ int32_t S = size<1>(tensor_b_);
262
+ int32_t C = size<0>(tensor_b_);
263
+
264
+ #if defined(_OPENMP)
265
+ #pragma omp parallel for collapse(3)
266
+ #endif
267
+ for (int32_t g = 0; g < G; ++g) {
268
+ for (int32_t n = 0; n < N; ++n) {
269
+ for (int32_t p = 0; p < P; ++p) {
270
+ for (int32_t q = 0; q < Q; ++q) {
271
+ for (int32_t k = 0; k < K; ++k) {
272
+ auto accumulator = ElementAcc(0);
273
+ for (int32_t r = 0; r < R; ++r) {
274
+ for (int32_t s = 0; s < S; ++s) {
275
+ for (int32_t c = 0; c < C; ++c) {
276
+ int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
277
+ int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
278
+ if (detail::is_activation_in_bounds(tensor_a_, n, h, w, c, g)) {
279
+ auto a = tensor_a_(c, w, h, n, g);
280
+ auto b = tensor_b_(c, s, r, k, g);
281
+ accumulator += ElementAcc(a * b);
282
+ }
283
+ }
284
+ }
285
+ }
286
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
287
+ epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
288
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
289
+ epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
290
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
291
+ if (not EpilogueFusionParams::ResidualAdd) {
292
+ output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n, g));
293
+ }
294
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
295
+ output += bias_converter(epi_fusion_params_.tensor_bias[k]);
296
+ }
297
+ output = epi_activation(output);
298
+ if (EpilogueFusionParams::ResidualAdd) {
299
+ output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n, g));
300
+ }
301
+ tensor_d_(k, q, p, n, g) = output_converter(output);
302
+ }
303
+ }
304
+ }
305
+ }
306
+ }
307
+
308
+ }
309
+
310
+ // Specialization for 3D fprop kernel
311
+ void fprop_reference(cute::Int<3> spatial_dims) {
312
+ int32_t G = size<5>(tensor_d_);
313
+ int32_t N = size<4>(tensor_d_);
314
+ int32_t Z = size<3>(tensor_d_);
315
+ int32_t P = size<2>(tensor_d_);
316
+ int32_t Q = size<1>(tensor_d_);
317
+ int32_t K = size<0>(tensor_d_);
318
+ int32_t T = size<3>(tensor_b_);
319
+ int32_t R = size<2>(tensor_b_);
320
+ int32_t S = size<1>(tensor_b_);
321
+ int32_t C = size<0>(tensor_b_);
322
+
323
+ #if defined(_OPENMP)
324
+ #pragma omp parallel for collapse(3)
325
+ #endif
326
+ for (int32_t g = 0; g < G; ++g) {
327
+ for (int32_t n = 0; n < N; ++n) {
328
+ for (int32_t z = 0; z < Z; ++z) {
329
+ for (int32_t p = 0; p < P; ++p) {
330
+ for (int32_t q = 0; q < Q; ++q) {
331
+ for (int32_t k = 0; k < K; ++k) {
332
+ auto accumulator = ElementAcc(0);
333
+ for (int32_t t = 0; t < T; ++t) {
334
+ for (int32_t r = 0; r < R; ++r) {
335
+ for (int32_t s = 0; s < S; ++s) {
336
+ for (int32_t c = 0; c < C; ++c) {
337
+ int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
338
+ int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
339
+ int32_t d = z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);
340
+ if (detail::is_activation_in_bounds(tensor_a_, n, d, h, w, c, g)) {
341
+ auto a = tensor_a_(c, w, h, d, n, g);
342
+ auto b = tensor_b_(c, s, r, t, k, g);
343
+ accumulator += ElementAcc(a * b);
344
+ }
345
+ }
346
+ }
347
+ }
348
+ }
349
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
350
+ epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
351
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
352
+ epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
353
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
354
+ if (not EpilogueFusionParams::ResidualAdd) {
355
+ output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n, g));
356
+ }
357
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
358
+ output += bias_converter(epi_fusion_params_.tensor_bias[k]);
359
+ }
360
+ output = epi_activation(output);
361
+ if (EpilogueFusionParams::ResidualAdd) {
362
+ output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n, g));
363
+ }
364
+ tensor_d_(k, q, p, z, n, g) = output_converter(output);
365
+ }
366
+ }
367
+ }
368
+ }
369
+ }
370
+ }
371
+
372
+ }
373
+
374
+ // Specialization for 1D dgrad kernel
375
+ void dgrad_reference(cute::Int<1> spatial_dims) {
376
+ int32_t G = size<3>(tensor_d_);
377
+ int32_t N = size<2>(tensor_d_);
378
+ int32_t W = size<1>(tensor_d_);
379
+ int32_t C = size<0>(tensor_d_);
380
+ int32_t K = size<2>(tensor_b_);
381
+ int32_t S = size<1>(tensor_b_);
382
+
383
+ #if defined(_OPENMP)
384
+ #pragma omp parallel for collapse(2)
385
+ #endif
386
+ for (int32_t g = 0; g < G; ++g) {
387
+ for (int32_t n = 0; n < N; ++n) {
388
+ for (int32_t w = 0; w < W; ++w) {
389
+ for (int32_t c = 0; c < C; ++c) {
390
+ auto accumulator = ElementAcc(0);
391
+ for (int32_t k = 0; k < K; ++k) {
392
+ for (int32_t s = 0; s < S; ++s) {
393
+ int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
394
+
395
+ if (q % cute::get<0>(tstride_) == 0) {
396
+ q /= cute::get<0>(tstride_);
397
+ } else {
398
+ continue;
399
+ }
400
+
401
+ if (detail::is_activation_in_bounds(tensor_a_, n, q, k, g)) {
402
+ accumulator += ElementAcc(tensor_a_(k, q, n, g) * tensor_b_(c, s, k, g));
403
+ }
404
+ }
405
+ }
406
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
407
+ ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
408
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
409
+ ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
410
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
411
+ if (not EpilogueFusionParams::ResidualAdd) {
412
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, w, n, g));
413
+ }
414
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
415
+ output += bias_converter(epi_fusion_params_.tensor_bias[c]);
416
+ }
417
+ output = epi_activation(output);
418
+ if (EpilogueFusionParams::ResidualAdd) {
419
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, w, n, g));
420
+ }
421
+ tensor_d_(c, w, n, g) = output_converter(output);
422
+ }
423
+ }
424
+ }
425
+ }
426
+
427
+ }
428
+
429
+ // Specialization for 2D dgrad kernel
430
+ void dgrad_reference(cute::Int<2> spatial_dims) {
431
+ int32_t G = size<4>(tensor_d_);
432
+ int32_t N = size<3>(tensor_d_);
433
+ int32_t H = size<2>(tensor_d_);
434
+ int32_t W = size<1>(tensor_d_);
435
+ int32_t C = size<0>(tensor_d_);
436
+ int32_t K = size<3>(tensor_b_);
437
+ int32_t R = size<2>(tensor_b_);
438
+ int32_t S = size<1>(tensor_b_);
439
+
440
+ #if defined(_OPENMP)
441
+ #pragma omp parallel for collapse(3)
442
+ #endif
443
+ for (int32_t g = 0; g < G; ++g) {
444
+ for (int32_t n = 0; n < N; ++n) {
445
+ for (int32_t h = 0; h < H; ++h) {
446
+ for (int32_t w = 0; w < W; ++w) {
447
+ for (int32_t c = 0; c < C; ++c) {
448
+ auto accumulator = ElementAcc(0);
449
+ for (int32_t k = 0; k < K; ++k) {
450
+ for (int32_t r = 0; r < R; ++r) {
451
+ for (int32_t s = 0; s < S; ++s) {
452
+ int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
453
+ int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);
454
+
455
+ if (q % cute::get<0>(tstride_) == 0) {
456
+ q /= cute::get<0>(tstride_);
457
+ } else {
458
+ continue;
459
+ }
460
+
461
+ if (p % cute::get<1>(tstride_) == 0) {
462
+ p /= cute::get<1>(tstride_);
463
+ } else {
464
+ continue;
465
+ }
466
+
467
+ if (detail::is_activation_in_bounds(tensor_a_, n, p, q, k, g)) {
468
+ accumulator += ElementAcc(tensor_a_(k, q, p, n, g) * tensor_b_(c, s, r, k, g));
469
+ }
470
+ }
471
+ }
472
+ }
473
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
474
+ ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
475
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
476
+ ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
477
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
478
+ if (not EpilogueFusionParams::ResidualAdd) {
479
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n, g));
480
+ }
481
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
482
+ output += bias_converter(epi_fusion_params_.tensor_bias[c]);
483
+ }
484
+ output = epi_activation(output);
485
+ if (EpilogueFusionParams::ResidualAdd) {
486
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n, g));
487
+ }
488
+
489
+ tensor_d_(c, w, h, n, g) = output_converter(output);
490
+ }
491
+ }
492
+ }
493
+ }
494
+ }
495
+
496
+ }
497
+
498
+ // Specialization for 3D dgrad kernel
499
+ void dgrad_reference(cute::Int<3> spatial_dims) {
500
+ int32_t G = size<5>(tensor_d_);
501
+ int32_t N = size<4>(tensor_d_);
502
+ int32_t D = size<3>(tensor_d_);
503
+ int32_t H = size<2>(tensor_d_);
504
+ int32_t W = size<1>(tensor_d_);
505
+ int32_t C = size<0>(tensor_d_);
506
+ int32_t K = size<4>(tensor_b_);
507
+ int32_t T = size<3>(tensor_b_);
508
+ int32_t R = size<2>(tensor_b_);
509
+ int32_t S = size<1>(tensor_b_);
510
+
511
+ #if defined(_OPENMP)
512
+ #pragma omp parallel for collapse(3)
513
+ #endif
514
+ for (int32_t g = 0; g < G; ++g) {
515
+ for (int32_t n = 0; n < N; ++n) {
516
+ for (int32_t d = 0; d < D; ++d) {
517
+ for (int32_t h = 0; h < H; ++h) {
518
+ for (int32_t w = 0; w < W; ++w) {
519
+ for (int32_t c = 0; c < C; ++c) {
520
+ auto accumulator = ElementAcc(0);
521
+ for (int32_t k = 0; k < K; ++k) {
522
+ for (int32_t t = 0; t < T; ++t) {
523
+ for (int32_t r = 0; r < R; ++r) {
524
+ for (int32_t s = 0; s < S; ++s) {
525
+ int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
526
+ int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);
527
+ int32_t z = d + cute::get<2>(padding_) - t * cute::get<2>(dilation_);
528
+
529
+ if (q % cute::get<0>(tstride_) == 0) {
530
+ q /= cute::get<0>(tstride_);
531
+ } else {
532
+ continue;
533
+ }
534
+
535
+ if (p % cute::get<1>(tstride_) == 0) {
536
+ p /= cute::get<1>(tstride_);
537
+ } else {
538
+ continue;
539
+ }
540
+
541
+ if (z % cute::get<2>(tstride_) == 0) {
542
+ z /= cute::get<2>(tstride_);
543
+ } else {
544
+ continue;
545
+ }
546
+
547
+ if (detail::is_activation_in_bounds(tensor_a_, n, z, p, q, k, g)) {
548
+ accumulator += ElementAcc(tensor_a_(k, q, p, z, n, g) * tensor_b_(c, s, r, t, k, g));
549
+ }
550
+ }
551
+ }
552
+ }
553
+ }
554
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
555
+ ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
556
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
557
+ ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
558
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
559
+ if (not EpilogueFusionParams::ResidualAdd) {
560
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n, g));
561
+ }
562
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
563
+ output += bias_converter(epi_fusion_params_.tensor_bias[c]);
564
+ }
565
+ output = epi_activation(output);
566
+ if (EpilogueFusionParams::ResidualAdd) {
567
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n, g));
568
+ }
569
+ tensor_d_(c, w, h, d, n, g) = output_converter(output);
570
+ }
571
+ }
572
+ }
573
+ }
574
+ }
575
+ }
576
+
577
+ }
578
+
579
+ // Specialization for 1D wgrad kernel
580
+ void wgrad_reference(cute::Int<1> spatial_dims) {
581
+ int32_t G = size<3>(tensor_d_);
582
+ int32_t N =
583
+ size<2>(tensor_a_);
584
+ int32_t Q =
585
+ size<1>(tensor_a_);
586
+ int32_t K =
587
+ size<0>(tensor_a_);
588
+ int32_t S = size<1>(tensor_d_);
589
+ int32_t C = size<0>(tensor_d_);
590
+
591
+ #if defined(_OPENMP)
592
+ #pragma omp parallel for collapse(2)
593
+ #endif
594
+ for (int32_t g = 0; g < G; ++g) {
595
+ for (int32_t k = 0; k < K; ++k) {
596
+ for (int32_t s = 0; s < S; ++s) {
597
+ for (int32_t c = 0; c < C; ++c) {
598
+ auto accumulator = ElementAcc(0);
599
+ for (int32_t n = 0; n < N; ++n) {
600
+ for (int32_t q = 0; q < Q; ++q) {
601
+ int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
602
+ bool is_in_bounds =
603
+ detail::is_activation_in_bounds(tensor_b_, n, w, c, g);
604
+ if (is_in_bounds) {
605
+ auto act =
606
+ tensor_b_(c, w, n, g);
607
+ auto xformed_act =
608
+ tensor_a_(k, q, n, g);
609
+ accumulator += ElementAcc(act * xformed_act);
610
+ }
611
+ }
612
+ }
613
+
614
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
615
+ epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
616
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
617
+ epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
618
+
619
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
620
+ if (not EpilogueFusionParams::ResidualAdd) {
621
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, s, k, g));
622
+ }
623
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
624
+ output += bias_converter(epi_fusion_params_.tensor_bias[c]);
625
+ }
626
+ output = epi_activation(output);
627
+ if (EpilogueFusionParams::ResidualAdd) {
628
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, s, k, g));
629
+ }
630
+ tensor_d_(c, s, k, g) = output_converter(output);
631
+ }
632
+ }
633
+ }
634
+ }
635
+ }
636
+
637
+ // Specialization for 2D wgrad kernel
638
+ void wgrad_reference(cute::Int<2> spatial_dims) {
639
+ int32_t G = size<4>(tensor_d_);
640
+ int32_t N =
641
+ size<3>(tensor_a_);
642
+ int32_t P =
643
+ size<2>(tensor_a_);
644
+ int32_t Q =
645
+ size<1>(tensor_a_);
646
+ int32_t K =
647
+ size<0>(tensor_a_);
648
+ int32_t R = size<2>(tensor_d_);
649
+ int32_t S = size<1>(tensor_d_);
650
+ int32_t C = size<0>(tensor_d_);
651
+
652
+ #if defined(_OPENMP)
653
+ #pragma omp parallel for collapse(3)
654
+ #endif
655
+ for (int32_t g = 0; g < G; ++g) {
656
+ for (int32_t k = 0; k < K; ++k) {
657
+ for (int32_t r = 0; r < R; ++r) {
658
+ for (int32_t s = 0; s < S; ++s) {
659
+ for (int32_t c = 0; c < C; ++c) {
660
+ auto accumulator = ElementAcc(0);
661
+ for (int32_t n = 0; n < N; ++n) {
662
+ for (int32_t p = 0; p < P; ++p) {
663
+ for (int32_t q = 0; q < Q; ++q) {
664
+ int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
665
+ int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
666
+ bool is_in_bounds =
667
+ detail::is_activation_in_bounds(tensor_b_, n, h, w, c, g);
668
+ if (is_in_bounds) {
669
+ auto act =
670
+ tensor_b_(c, w, h, n, g);
671
+ auto xformed_act =
672
+ tensor_a_(k, q, p, n, g);
673
+ accumulator += ElementAcc(act * xformed_act);
674
+ }
675
+ }
676
+ }
677
+ }
678
+
679
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
680
+ epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
681
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
682
+ epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
683
+
684
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
685
+ if (not EpilogueFusionParams::ResidualAdd) {
686
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k, g));
687
+ }
688
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
689
+ output += bias_converter(epi_fusion_params_.tensor_bias[c]);
690
+ }
691
+ output = epi_activation(output);
692
+ if (EpilogueFusionParams::ResidualAdd) {
693
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k, g));
694
+ }
695
+ tensor_d_(c, s, r, k, g) = output_converter(output);
696
+ }
697
+ }
698
+ }
699
+ }
700
+ }
701
+ }
702
+
703
+ // Specialization for 3D wgrad kernel
704
+ void wgrad_reference(cute::Int<3> spatial_dims) {
705
+ int32_t G = size<5>(tensor_d_);
706
+ int32_t N =
707
+ size<4>(tensor_a_);
708
+ int32_t Z =
709
+ size<3>(tensor_a_);
710
+ int32_t P =
711
+ size<2>(tensor_a_);
712
+ int32_t Q =
713
+ size<1>(tensor_a_);
714
+ int32_t K =
715
+ size<0>(tensor_a_);
716
+ int32_t T = size<3>(tensor_d_);
717
+ int32_t R = size<2>(tensor_d_);
718
+ int32_t S = size<1>(tensor_d_);
719
+ int32_t C = size<0>(tensor_d_);
720
+
721
+ #if defined(_OPENMP)
722
+ #pragma omp parallel for collapse(3)
723
+ #endif
724
+ for (int32_t g = 0 ; g < G; ++g) {
725
+ for (int32_t k = 0; k < K; ++k) {
726
+ for (int32_t t = 0; t < T; ++t) {
727
+ for (int32_t r = 0; r < R; ++r) {
728
+ for (int32_t s = 0; s < S; ++s) {
729
+ for (int32_t c = 0; c < C; ++c) {
730
+ auto accumulator = ElementAcc(0);
731
+ for (int32_t n = 0; n < N; ++n) {
732
+ for (int32_t z = 0; z < Z; ++z) {
733
+ for (int32_t p = 0; p < P; ++p) {
734
+ for (int32_t q = 0; q < Q; ++q) {
735
+ int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
736
+ int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
737
+ int32_t d = z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);
738
+ bool is_in_bounds =
739
+ detail::is_activation_in_bounds(tensor_b_, n, d, h, w, c, g);
740
+ if (is_in_bounds) {
741
+ auto act =
742
+ tensor_b_(c, w, h, d, n, g);
743
+ auto xformed_act =
744
+ tensor_a_(k, q, p, z, n, g);
745
+ accumulator += ElementAcc(act * xformed_act);
746
+ }
747
+ }
748
+ }
749
+ }
750
+ }
751
+
752
+ ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
753
+ epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
754
+ ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
755
+ epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
756
+
757
+ ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
758
+ if (not EpilogueFusionParams::ResidualAdd) {
759
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k, g));
760
+ }
761
+ if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
762
+ output += bias_converter(epi_fusion_params_.tensor_bias[c]);
763
+ }
764
+ output = epi_activation(output);
765
+ if (EpilogueFusionParams::ResidualAdd) {
766
+ output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k, g));
767
+ }
768
+ tensor_d_(c, s, r, t, k, g) = output_converter(output);
769
+ }
770
+ }
771
+ }
772
+ }
773
+ }
774
+ }
775
+ }
776
+ };
777
+
778
+ /////////////////////////////////////////////////////////////////////////////////////////////////
779
+
780
+ } // cutlass::reference::host
781
+
782
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/convolution.h ADDED
@@ -0,0 +1,802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Reference implementation for convolution in host-side code.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/coord.h"
39
+ #include "cutlass/functional.h"
40
+ #include "cutlass/layout/tensor.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/numeric_types.h"
43
+ #include "cutlass/tensor_ref.h"
44
+ #include "cutlass/tensor_view.h"
45
+ #include "cutlass/conv/convolution.h"
46
+ #include "cutlass/conv/conv2d_problem_size.h"
47
+ #include "cutlass/conv/conv3d_problem_size.h"
48
+ #include <iostream>
49
+
50
+ namespace cutlass {
51
+ namespace reference {
52
+ namespace host {
53
+
54
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
55
+ /// Forward propagation
56
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
/// y = conv2d(x, w)
///
/// Naive host-side reference for 2D forward-propagation convolution with
/// grouped-convolution support. For every output element (n, p, q, k) it
/// accumulates x * w over the filter taps (r, s) and the input channels of
/// output channel k's group, then applies the linear epilogue
///   y_out = convert(alpha * acc + beta * y_in).
/// Accumulation happens in ElementAccumulator with a fixed (r, s, c) loop
/// order, so results are reproducible for non-associative (floating-point)
/// accumulator types.
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ElementD = ElementC,
  typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>
>
void Conv2dFprop(
  conv::Conv2dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_x,      // activations, coordinate order (n, h, w, c)
  TensorRef<ElementB, LayoutB> tensor_w,      // filters, coordinate order (k, r, s, c)
  TensorRef<ElementC, LayoutC> tensor_y_in,   // epilogue source; read only when beta != 0
  TensorRef<ElementD, LayoutC> tensor_y_out,  // destination, coordinate order (n, p, q, k)
  ElementCompute alpha,
  ElementCompute beta) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Apply MMA and accumulate ElementAccumulator
  for (int n = 0; n < problem_size.N; ++n) {
    for (int p = 0; p < problem_size.P; ++p) {
      for (int q = 0; q < problem_size.Q; ++q) {
        for (int k = 0; k < problem_size.K; ++k) {

          // Grouped convolution: output channel k only reads the input
          // channels belonging to its group.
          int group_idx = k / (problem_size.K / problem_size.groups);
          int channels_per_group = problem_size.C / problem_size.groups;

          ElementAccumulator acc = ElementAccumulator();

          for (int r = 0; r < problem_size.R; ++r) {
            for (int s = 0; s < problem_size.S; ++s) {
              for (int c = 0; c < channels_per_group; ++c) {

                int filter_r = r;
                int filter_s = s;

                // kConvolution flips the filter spatially; kCrossCorrelation
                // reads it as stored.
                if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
                  filter_r = problem_size.R - 1 - r;
                  filter_s = problem_size.S - 1 - s;
                }

                // Map output coordinate (p, q) plus filter tap to an input
                // coordinate using stride, padding and dilation.
                int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
                int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;

                // Out-of-bounds taps contribute zero (implicit zero padding).
                if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) {

                  ElementA a = tensor_x.at({n, h, w, c + group_idx * channels_per_group});
                  ElementB b = tensor_w.at({k, r, s, c});

                  acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);

                }
              }
            }
          }

          // Apply Epilogue, compute ElementCompute, convert and store ElementC.
          // tensor_y_in is skipped entirely when beta == 0 so it may be left
          // unpopulated in that case.
          ElementC c_ref = ElementC();

          if (beta != ElementCompute()) {
            c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k));
          }

          tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) =
            convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
        }
      }
    }
  }
}
136
+
137
/// Depthwise-separable convolution
///
/// Host reference for depthwise 2D fprop: output channel g is produced by
/// convolving input channel g with its own single filter slice, read at
/// filter coordinate (g, r, s, 0). Extents are taken directly from the
/// tensor views rather than from a problem-size struct.
template <typename ElementA,
          typename LayoutA,
          typename ElementB,
          typename LayoutB,
          typename ElementC,
          typename LayoutC,
          typename ElementCompute,
          typename ElementAccumulator = ElementCompute,
          typename ElementD = ElementC,
          typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
          typename InnerProductOp = multiply_add<ElementAccumulator>>
void Depsep_Fprop(cutlass::TensorView<ElementA, LayoutA> tensor_A,   // activations (n, h, w, g)
                  cutlass::TensorView<ElementB, LayoutB> tensor_B,   // per-channel filters (g, r, s, 0)
                  cutlass::TensorView<ElementC, LayoutC> tensor_C,   // epilogue source
                  cutlass::TensorView<ElementD, LayoutC> tensor_D,   // destination
                  ElementCompute alpha,
                  ElementCompute beta,
                  cutlass::Tensor4DCoord padding = cutlass::Tensor4DCoord(),
                  cutlass::Coord<2> conv_stride = cutlass::Coord<2>(),
                  cutlass::Coord<2> dilation = cutlass::Coord<2>(),
                  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Apply MMA and accumulate ElementAccumulator
  for (int n = 0; n < tensor_C.extent().n(); ++n) {
    for (int p = 0; p < tensor_C.extent().h(); ++p) {
      for (int q = 0; q < tensor_C.extent().w(); ++q) {
        for (int g = 0; g < tensor_C.extent().c(); ++g) {
          ElementAccumulator acc = ElementAccumulator();
          for (int r = 0; r < tensor_B.extent().h(); ++r) {
            for (int s = 0; s < tensor_B.extent().w(); ++s) {

              // input activation H and W
              // NOTE(review): padding[0] is applied to H and padding[2] to W,
              // which implies the Tensor4DCoord packs the pads as
              // (h_front, h_back, w_front, w_back) — confirm against callers.
              int h = p * conv_stride[0] - padding[0] + r * dilation[0];
              int w = q * conv_stride[1] - padding[2] + s * dilation[1];

              // Out-of-bounds taps contribute zero (implicit zero padding).
              if (h < tensor_A.extent().h() && h >= 0 && w < tensor_A.extent().w() && w >= 0) {
                ElementA a = tensor_A.at(cutlass::make_Coord(n, h, w, g));

                // kConvolution reads the filter flipped in both spatial dims.
                ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation)
                                 ? tensor_B.at(cutlass::make_Coord(g, r, s, 0))
                                 : tensor_B.at(cutlass::make_Coord(
                                       g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0));

                acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
              }
            }
          }

          // Apply Epilogue, compute ElementCompute, convert and store ElementC.
          // Unlike Conv2dFprop, tensor_C is read unconditionally here.
          ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g));
          tensor_D.at(cutlass::make_Coord(n, p, q, g)) =
              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
        }
      }
    }
  }
}
198
+
199
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
200
+ /// Dgrad / Deconv
201
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
202
+
203
+ /// dx = dgrad(dy, w)
204
+ template <
205
+ typename ElementA,
206
+ typename LayoutA,
207
+ typename ElementB,
208
+ typename LayoutB,
209
+ typename ElementC,
210
+ typename LayoutC,
211
+ typename ElementCompute,
212
+ typename ElementAccumulator = ElementCompute,
213
+ typename ElementD = ElementC,
214
+ typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
215
+ typename InnerProductOp = multiply_add<ElementAccumulator>
216
+ >
217
+ void Conv2dDgrad(
218
+ cutlass::conv::Conv2dProblemSize problem_size,
219
+ TensorRef<ElementA, LayoutA> tensor_dy,
220
+ TensorRef<ElementB, LayoutB> tensor_w,
221
+ TensorRef<ElementC, LayoutC> tensor_dx_in,
222
+ TensorRef<ElementD, LayoutC> tensor_dx_out,
223
+ ElementCompute alpha,
224
+ ElementCompute beta,
225
+ bool is_deconv = false) {
226
+
227
+ ConvertOp convert_op;
228
+ InnerProductOp inner_product_op;
229
+
230
+ // Apply MMA and accumulate ElementAccumulator
231
+ for (int n = 0; n < problem_size.N; ++n) {
232
+ for (int h = 0; h < problem_size.H; ++h) {
233
+ for (int w = 0; w < problem_size.W; ++w) {
234
+ for (int c = 0; c < problem_size.C; ++c) {
235
+
236
+ ElementAccumulator acc = ElementAccumulator();
237
+
238
+ for (int r = 0; r < problem_size.R; ++r) {
239
+ for (int s = 0; s < problem_size.S; ++s) {
240
+ for (int k = 0; k < problem_size.K; ++k) {
241
+
242
+ int filter_r = r;
243
+ int filter_s = s;
244
+
245
+ if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
246
+ filter_r = problem_size.R - 1 - r;
247
+ filter_s = problem_size.S - 1 - s;
248
+ }
249
+
250
+ int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
251
+ int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;
252
+
253
+ if (p >= 0 && (p % problem_size.stride_h) == 0 &&
254
+ q >= 0 && (q % problem_size.stride_w) == 0) {
255
+
256
+ p = p / problem_size.stride_h;
257
+ q = q / problem_size.stride_w;
258
+ #if 0
259
+ std::cout << "row:"
260
+ << n * problem_size.H * problem_size.W +
261
+ h * problem_size.W +
262
+ w << " "
263
+ << "n, p, q: ("
264
+ << n << ", "
265
+ << p << ", "
266
+ << q << ") * "
267
+ << "r, s: ("
268
+ << r << ", "
269
+ << s << ") ["
270
+ << ((p < problem_size.P && q < problem_size.Q) ? "true":"false") << "]"
271
+ << std::endl;
272
+ #endif
273
+ if (p < problem_size.P && q < problem_size.Q) {
274
+
275
+ ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k));
276
+ ElementB b = is_deconv ? tensor_w.at(cutlass::make_Coord(c, r, s, k))
277
+ : tensor_w.at(cutlass::make_Coord(k, r, s, c));
278
+
279
+ acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
280
+ }
281
+ }
282
+
283
+ } // for (K)
284
+ } // for (S)
285
+ } // for (R)
286
+
287
+ // Apply Epilogue, compute ElementCompute, convert and store ElementC
288
+ ElementC c_ref = ElementC();
289
+
290
+ if (beta != ElementCompute()) {
291
+ c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c));
292
+ }
293
+
294
+ tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) =
295
+ convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
296
+
297
+ } // for (C)
298
+ } // for (W)
299
+ } // for (H)
300
+ } // for (N)
301
+ }
302
+
303
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
304
+ /// Wgrad
305
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
306
+
307
/// dw = wgrad(dy, x)
///
/// Host-side reference for the 2D weight-gradient (wgrad) convolution: for
/// each filter element (k, r, s, c) it accumulates dy * x over the whole
/// output domain (n, p, q), then applies the linear epilogue
///   dw_out = convert(alpha * acc + beta * dw_in).
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ElementD = ElementC,
  typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>
>
void Conv2dWgrad(
  cutlass::conv::Conv2dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_dy,     // output gradient, coordinate order (n, p, q, k)
  TensorRef<ElementB, LayoutB> tensor_x,      // activations, coordinate order (n, h, w, c)
  TensorRef<ElementC, LayoutC> tensor_dw_in,  // epilogue source; read only when beta != 0
  TensorRef<ElementD, LayoutC> tensor_dw_out, // destination, coordinate order (k, r, s, c)
  ElementCompute alpha,
  ElementCompute beta) {

  InnerProductOp inner_product_op;
  ConvertOp convert_op;

  // Apply MMA and accumulate ElementAccumulator
  for (int k = 0; k < problem_size.K; ++k) {
    for (int r = 0; r < problem_size.R; ++r) {
      for (int s = 0; s < problem_size.S; ++s) {
        for (int c = 0; c < problem_size.C; ++c) {

          ElementAccumulator acc = ElementAccumulator();

          for (int n = 0; n < problem_size.N; ++n) {
            for (int p = 0; p < problem_size.P; ++p) {
              for (int q = 0; q < problem_size.Q; ++q) {

                cutlass::Tensor4DCoord b_coord;

                int filter_r = r;
                int filter_s = s;

                // kConvolution flips the filter spatially.
                if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
                  filter_r = problem_size.R - 1 - r;
                  filter_s = problem_size.S - 1 - s;
                }

                // Activation coordinate read by output (p, q) through this tap.
                b_coord = make_Coord(
                  n,
                  p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h,
                  q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w,
                  c);

                // Out-of-bounds taps contribute zero (implicit zero padding).
                if (b_coord.h() < problem_size.H && b_coord.h() >= 0 &&
                  b_coord.w() < problem_size.W && b_coord.w() >= 0) {

                  ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k)));
                  ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord));
                  acc = inner_product_op(a, b, acc);
                }
              }
            }
          }

          // Apply Epilogue, compute ElementCompute, convert and store ElementC.
          // tensor_dw_in is skipped entirely when beta == 0.
          ElementC c_ref = ElementC();

          if (beta != ElementCompute()) {
            c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c));
          }

          tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) =
            convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));

        } // for (C)
      } // for (S)
    } // for (R)
  } // for (K)
}
387
+
388
+ /// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad.
389
+ template <
390
+ typename ElementA,
391
+ typename LayoutA,
392
+ typename ElementB,
393
+ typename LayoutB,
394
+ typename ElementC,
395
+ typename LayoutC,
396
+ typename ElementCompute,
397
+ typename ElementAccumulator = ElementCompute,
398
+ typename ElementD = ElementC,
399
+ typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
400
+ typename InnerProductOp = multiply_add<ElementAccumulator>
401
+ >
402
+ void Conv2d(
403
+ conv::Operator convolutional_operator,
404
+ conv::Conv2dProblemSize problem_size,
405
+ TensorRef<ElementA, LayoutA> tensor_A,
406
+ TensorRef<ElementB, LayoutB> tensor_B,
407
+ TensorRef<ElementC, LayoutC> tensor_C,
408
+ TensorRef<ElementD, LayoutC> tensor_D,
409
+ ElementCompute alpha,
410
+ ElementCompute beta) {
411
+
412
+ switch (convolutional_operator) {
413
+ case conv::Operator::kFprop:
414
+ Conv2dFprop<
415
+ ElementA, LayoutA,
416
+ ElementB, LayoutB,
417
+ ElementC, LayoutC,
418
+ ElementCompute,
419
+ ElementAccumulator,
420
+ ElementD,
421
+ ConvertOp, InnerProductOp
422
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
423
+ break;
424
+
425
+ case conv::Operator::kDeconv:
426
+ case conv::Operator::kDgrad:
427
+ Conv2dDgrad<
428
+ ElementA, LayoutA,
429
+ ElementB, LayoutB,
430
+ ElementC, LayoutC,
431
+ ElementCompute,
432
+ ElementAccumulator,
433
+ ElementD,
434
+ ConvertOp, InnerProductOp
435
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, (convolutional_operator == conv::Operator::kDeconv));
436
+ break;
437
+
438
+ case conv::Operator::kWgrad:
439
+ Conv2dWgrad<
440
+ ElementA, LayoutA,
441
+ ElementB, LayoutB,
442
+ ElementC, LayoutC,
443
+ ElementCompute,
444
+ ElementAccumulator,
445
+ ElementD,
446
+ ConvertOp, InnerProductOp
447
+ >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
448
+ break;
449
+
450
+ default:
451
+ break;
452
+ }
453
+ }
454
+
455
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
456
+ /// 3D convolution
457
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
458
+
459
/// y = conv3d(x, w)
///
/// Naive host-side reference for 3D forward-propagation convolution. For each
/// output element (n, z, p, q, k) it accumulates x * w over the filter taps
/// (t, r, s) and all input channels c, then applies the linear epilogue
///   y_out = convert(alpha * acc + beta * y_in).
/// Unlike Conv2dFprop, no grouped-convolution support is present here.
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>
>
void Conv3dFprop(
  conv::Conv3dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_x,     // activations, coordinate order (n, d, h, w, c)
  TensorRef<ElementB, LayoutB> tensor_w,     // filters, coordinate order (k, t, r, s, c)
  TensorRef<ElementC, LayoutC> tensor_y_in,  // epilogue source; read only when beta != 0
  TensorRef<ElementC, LayoutC> tensor_y_out, // destination, coordinate order (n, z, p, q, k)
  ElementCompute alpha,
  ElementCompute beta) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Apply MMA and accumulate ElementAccumulator
  for (int n = 0; n < problem_size.N; ++n) {
    for (int z = 0; z < problem_size.Z; ++z) {
      for (int p = 0; p < problem_size.P; ++p) {
        for (int q = 0; q < problem_size.Q; ++q) {
          for (int k = 0; k < problem_size.K; ++k) {

            ElementAccumulator acc = ElementAccumulator();

            for (int t = 0; t < problem_size.T; ++t) {
              for (int r = 0; r < problem_size.R; ++r) {
                for (int s = 0; s < problem_size.S; ++s) {
                  for (int c = 0; c < problem_size.C; ++c) {

                    int filter_t = t;
                    int filter_r = r;
                    int filter_s = s;

                    // kConvolution flips the filter in all three spatial dims.
                    if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
                      filter_t = problem_size.T - 1 - t;
                      filter_r = problem_size.R - 1 - r;
                      filter_s = problem_size.S - 1 - s;
                    }

                    // Map output coordinate (z, p, q) plus filter tap to an
                    // input coordinate using stride, padding and dilation.
                    int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
                    int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
                    int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;

                    // Out-of-bounds taps contribute zero (implicit zero padding).
                    if (d >= 0 && d < problem_size.D &&
                      h >=0 && h < problem_size.H &&
                      w >= 0 && w < problem_size.W) {

                      ElementA a = tensor_x.at({n, d, h, w, c});
                      ElementB b = tensor_w.at({k, t, r, s, c});

                      acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
                    }
                  }
                }
              }
            }

            // Apply Epilogue, compute ElementCompute, convert and store ElementC.
            // tensor_y_in is skipped entirely when beta == 0.
            ElementC c_ref = ElementC();

            if (beta != ElementCompute()) {
              c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k));
            }

            tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) =
              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
          }
        }
      }
    }
  }
}
541
+
542
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
543
+ /// Dgrad / Deconv
544
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
545
+
546
/// dx = dgrad(dy, w)
///
/// Host-side reference for the 3D data-gradient (dgrad) convolution: for each
/// input-gradient element (n, d, h, w, c) it accumulates dy * w over all
/// filter taps (t, r, s) and output channels k that touched that input
/// position, then applies the linear epilogue
///   dx_out = convert(alpha * acc + beta * dx_in).
///
/// When is_deconv is true the filter tensor is read with transposed channel
/// dimensions, i.e. at (c, t, r, s, k) instead of (k, t, r, s, c).
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>
>
void Conv3dDgrad(
  cutlass::conv::Conv3dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_dy,     // output gradient, coordinate order (n, z, p, q, k)
  TensorRef<ElementB, LayoutB> tensor_w,      // filters
  TensorRef<ElementC, LayoutC> tensor_dx_in,  // epilogue source; read only when beta != 0
  TensorRef<ElementC, LayoutC> tensor_dx_out, // destination, coordinate order (n, d, h, w, c)
  ElementCompute alpha,
  ElementCompute beta,
  bool is_deconv = false) {

  ConvertOp convert_op;
  InnerProductOp inner_product_op;

  // Apply MMA and accumulate ElementAccumulator
  for (int n = 0; n < problem_size.N; ++n) {
    for (int d = 0; d < problem_size.D; ++d) {
      for (int h = 0; h < problem_size.H; ++h) {
        for (int w = 0; w < problem_size.W; ++w) {
          for (int c = 0; c < problem_size.C; ++c) {

            ElementAccumulator acc = ElementAccumulator();

            for (int t = 0; t < problem_size.T; ++t) {
              for (int r = 0; r < problem_size.R; ++r) {
                for (int s = 0; s < problem_size.S; ++s) {
                  for (int k = 0; k < problem_size.K; ++k) {

                    int filter_t = t;
                    int filter_r = r;
                    int filter_s = s;

                    // kConvolution flips the filter in all three spatial dims.
                    if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
                      filter_t = problem_size.T - 1 - t;
                      filter_r = problem_size.R - 1 - r;
                      filter_s = problem_size.S - 1 - s;
                    }

                    // Invert the fprop index map: the output position
                    // (z, p, q) that read input (d, h, w) through this tap.
                    int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d;
                    int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
                    int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;

                    // Only positions that land exactly on a stride multiple
                    // correspond to a real output element.
                    if (z >= 0 && (z % problem_size.stride_d) == 0 &&
                      p >= 0 && (p % problem_size.stride_h) == 0 &&
                      q >= 0 && (q % problem_size.stride_w) == 0) {

                      z = z / problem_size.stride_d;
                      p = p / problem_size.stride_h;
                      q = q / problem_size.stride_w;

                      if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) {

                        ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k));
                        ElementB b = is_deconv ? tensor_w.at(cutlass::make_Coord(c, t, r, s, k))
                                    : tensor_w.at(cutlass::make_Coord(k, t, r, s, c));
                        acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
                      }
                    }

                  } // for (K)
                } // for (S)
              } // for (R)
            } // for (T)

            // Apply Epilogue, compute ElementCompute, convert and store ElementC.
            // tensor_dx_in is skipped entirely when beta == 0.
            ElementC c_ref = ElementC();

            if (beta != ElementCompute()) {
              c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c));
            }

            tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) =
              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));

          } // for (C)
        } // for (W)
      } // for (H)
    } // for (D)
  } // for (N)
}
638
+
639
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
640
+ /// Wgrad
641
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
642
+
643
/// dw = wgrad(dy, x)
///
/// Host-side reference for the 3D weight-gradient (wgrad) convolution: for
/// each filter element (k, t, r, s, c) it accumulates dy * x over the whole
/// output domain (n, z, p, q), then applies the linear epilogue
///   dw_out = convert(alpha * acc + beta * dw_in).
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>
>
void Conv3dWgrad(
  cutlass::conv::Conv3dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_dy,     // output gradient, coordinate order (n, z, p, q, k)
  TensorRef<ElementB, LayoutB> tensor_x,      // activations, coordinate order (n, d, h, w, c)
  TensorRef<ElementC, LayoutC> tensor_dw_in,  // epilogue source; read only when beta != 0
  TensorRef<ElementC, LayoutC> tensor_dw_out, // destination, coordinate order (k, t, r, s, c)
  ElementCompute alpha,
  ElementCompute beta) {

  InnerProductOp inner_product_op;
  ConvertOp convert_op;

  // Apply MMA and accumulate ElementAccumulator
  for (int k = 0; k < problem_size.K; ++k) {
    for (int t = 0; t < problem_size.T; ++t) {
      for (int r = 0; r < problem_size.R; ++r) {
        for (int s = 0; s < problem_size.S; ++s) {
          for (int c = 0; c < problem_size.C; ++c) {

            ElementAccumulator acc = ElementAccumulator();

            for (int n = 0; n < problem_size.N; ++n) {
              for (int z = 0; z < problem_size.Z; ++z) {
                for (int p = 0; p < problem_size.P; ++p) {
                  for (int q = 0; q < problem_size.Q; ++q) {

                    int filter_t = t;
                    int filter_r = r;
                    int filter_s = s;

                    // kConvolution flips the filter in all three spatial dims.
                    if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
                      filter_t = problem_size.T - 1 - t;
                      filter_r = problem_size.R - 1 - r;
                      filter_s = problem_size.S - 1 - s;
                    }

                    // Activation coordinate read by output (z, p, q) through
                    // this filter tap.
                    Tensor5DCoord b_coord = make_Coord(
                      n,
                      z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d,
                      p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h,
                      q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w,
                      c);

                    // Out-of-bounds taps contribute zero (implicit zero padding).
                    if (b_coord.d() < problem_size.D && b_coord.d() >= 0 &&
                      b_coord.h() < problem_size.H && b_coord.h() >= 0 &&
                      b_coord.w() < problem_size.W && b_coord.w() >= 0) {

                      ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)));
                      ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord));

                      acc = inner_product_op(a, b, acc);
                    }
                  }
                }
              }
            }

            // Apply Epilogue, compute ElementCompute, convert and store ElementC.
            // tensor_dw_in is skipped entirely when beta == 0.
            ElementC c_ref = ElementC();

            if (beta != ElementCompute()) {
              c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c));
            }

            tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) =
              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));

          } // for (C)
        } // for (S)
      } // for (R)
    } // for (T)
  } // for (K)
}
729
+
730
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
731
+
732
/// Generic 3D convolution targeting Conv3dFprop, Conv3dDgrad, and Conv3dWgrad.
/// (The original comment named the 2D kernels; this dispatcher calls the 3D ones.)
///
/// Dispatches on the convolutional operator and forwards all tensors and
/// epilogue scalars to the matching reference implementation. kDeconv shares
/// the dgrad path with transposed filter indexing. Unrecognized operators
/// fall through to the empty default case (no-op).
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementCompute,
  typename ElementAccumulator = ElementCompute,
  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
  typename InnerProductOp = multiply_add<ElementAccumulator>
>
void Conv3d(
  conv::Operator convolutional_operator,
  conv::Conv3dProblemSize problem_size,
  TensorRef<ElementA, LayoutA> tensor_A,
  TensorRef<ElementB, LayoutB> tensor_B,
  TensorRef<ElementC, LayoutC> tensor_C,
  TensorRef<ElementC, LayoutC> tensor_D,
  ElementCompute alpha,
  ElementCompute beta) {

  switch (convolutional_operator) {
  case conv::Operator::kFprop:
    Conv3dFprop<
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      ElementCompute,
      ElementAccumulator,
      ConvertOp, InnerProductOp
    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
    break;

  case conv::Operator::kDeconv:
  case conv::Operator::kDgrad:
    // Final argument selects transposed filter indexing for deconvolution.
    Conv3dDgrad<
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      ElementCompute,
      ElementAccumulator,
      ConvertOp, InnerProductOp
    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, (convolutional_operator == conv::Operator::kDeconv));
    break;

  case conv::Operator::kWgrad:
    Conv3dWgrad<
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      ElementCompute,
      ElementAccumulator,
      ConvertOp, InnerProductOp
    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
    break;

  default:
    break;
  }
}
794
+
795
+ /////////////////////////////////////////////////////////////////////////////////////////////////
796
+
797
+ } // namespace host
798
+ } // namespace reference
799
+ } // namespace cutlass
800
+
801
+ /////////////////////////////////////////////////////////////////////////////////////////////////
802
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/error_metrics.h ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /***************************************************************************************************
3
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ *
6
+ * Redistribution and use in source and binary forms, with or without
7
+ * modification, are permitted provided that the following conditions are met:
8
+ *
9
+ * 1. Redistributions of source code must retain the above copyright notice, this
10
+ * list of conditions and the following disclaimer.
11
+ *
12
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ * this list of conditions and the following disclaimer in the documentation
14
+ * and/or other materials provided with the distribution.
15
+ *
16
+ * 3. Neither the name of the copyright holder nor the names of its
17
+ * contributors may be used to endorse or promote products derived from
18
+ * this software without specific prior written permission.
19
+ *
20
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ *
31
+ **************************************************************************************************/
32
+ #pragma once
33
+
34
+ #include <cmath>
35
+
36
+ #include "cutlass/cutlass.h"
37
+ #include "cutlass/complex.h"
38
+ #include "cutlass/util/reference/host/tensor_reduce.h"
39
+ #include "cutlass/core_io.h"
40
+
41
+ namespace cutlass {
42
+ namespace reference {
43
+ namespace host {
44
+
45
+ /// Helper to compute the relative error metric for tensor A_computed w.r.t. to tensor A_reference
46
+ template <
47
+ typename Element,
48
+ typename Layout,
49
+ typename ComputeType = double
50
+ >
51
+ ComputeType TensorRelativeErrorMetric(
52
+ TensorView<Element, Layout> view_A_computed,
53
+ TensorView<Element, Layout> view_B_reference,
54
+ ComputeType identity = ComputeType()
55
+ ) {
56
+
57
+ return cutlass::reference::host::TensorNormDiff(view_A_computed, view_B_reference, identity) /
58
+ cutlass::reference::host::TensorNorm(view_B_reference, identity);
59
+ }
60
+
61
+
62
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
63
+
64
+ } // namespace host
65
+ } // namespace reference
66
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm.h ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GEMM in host-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/numeric_types.h"
39
+ #include "cutlass/functional.h"
40
+ #include "cutlass/numeric_conversion.h"
41
+
42
+ #include "cutlass/tensor_view.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+ #include "cutlass/arch/mma.h"
45
+ #include "cutlass/util/host_tensor.h"
46
+
47
+ namespace cutlass {
48
+ namespace reference {
49
+ namespace host {
50
+
51
+ template<typename Out, typename In>
52
+ struct CastIfScalar {
53
+ static Out cast(In in) {
54
+ return Out(in);
55
+ }
56
+ };
57
+
58
+ template<typename OutScalar, typename In>
59
+ struct CastIfScalar<cutlass::complex<OutScalar>, In> {
60
+ typedef cutlass::complex<OutScalar> Out;
61
+ static Out cast(In in) {
62
+ return Out(static_cast<OutScalar>(in));
63
+ }
64
+ };
65
+
66
+ template<typename OutScalar, typename InScalar>
67
+ struct CastIfScalar<cutlass::complex<OutScalar>, cutlass::complex<InScalar>> {
68
+ typedef cutlass::complex<OutScalar> Out;
69
+ typedef cutlass::complex<InScalar> In;
70
+ static Out cast(In in) {
71
+ return Out(in);
72
+ }
73
+ };
74
+
75
+ template<typename Out, typename In>
76
+ Out cast_if_scalar(In in) {
77
+ return CastIfScalar<Out, In>::cast(in);
78
+ }
79
+
80
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
81
+
82
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
83
+ /// objects.
84
+ template <
85
+ typename ElementA,
86
+ typename LayoutA,
87
+ typename ElementB,
88
+ typename LayoutB,
89
+ typename ElementC,
90
+ typename LayoutC,
91
+ typename ScalarType,
92
+ typename ComputeType,
93
+ typename InnerProductOp = multiply_add<ComputeType>,
94
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
95
+ >
96
+ void compute_gemm(
97
+ gemm::GemmCoord problem_size,
98
+ ScalarType alpha,
99
+ TensorRef<ElementA, LayoutA> tensor_a,
100
+ TensorRef<ElementB, LayoutB> tensor_b,
101
+ ScalarType beta,
102
+ TensorRef<ElementC, LayoutC> tensor_c,
103
+ TensorRef<ElementC, LayoutC> tensor_d,
104
+ ComputeType initial_accum) {
105
+
106
+ static_assert(
107
+ LayoutA::kRank == 2 &&
108
+ LayoutB::kRank == 2 &&
109
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
110
+
111
+
112
+ // Note: batch is ignored.
113
+ int const M = problem_size.m();
114
+ int const N = problem_size.n();
115
+ int const K = problem_size.k();
116
+
117
+ // Blocking necessary to speedup reference implementation
118
+ int const Mblock = 16;
119
+ int const Nblock = 16;
120
+
121
+ ConvertOp convert_op;
122
+ InnerProductOp inner_product_op;
123
+
124
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
125
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
126
+
127
+ ComputeType accum[Mblock][Nblock];
128
+
129
+ for (int j = 0; j < Nblock; j++) {
130
+ for (int i = 0; i < Mblock; i++) {
131
+ accum[i][j] = initial_accum;
132
+ }
133
+ }
134
+
135
+ for (int k_block = 0; k_block < K; ++k_block) {
136
+ for (int j = 0; j < Nblock; j++) {
137
+ for (int i = 0; i < Mblock; i++) {
138
+ int row = row_block + i;
139
+ int col = col_block + j;
140
+
141
+ if (row < M && col < N) {
142
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
143
+ ElementB b = tensor_b.at(MatrixCoord(k_block, col));
144
+
145
+ ComputeType compute_a(cast_if_scalar<ComputeType>(a));
146
+ ComputeType compute_b(cast_if_scalar<ComputeType>(b));
147
+
148
+ accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]);
149
+ }
150
+ }
151
+ }
152
+ }
153
+
154
+ for (int j = 0; j < Nblock; j++) {
155
+ for (int i = 0; i < Mblock; i++) {
156
+ int row = row_block + i;
157
+ int col = col_block + j;
158
+
159
+ MatrixCoord coord = MatrixCoord(row, col);
160
+
161
+ if (row < M && col < N) {
162
+ tensor_d.at(coord) = convert_op(
163
+ alpha * ScalarType(accum[i][j]) +
164
+ beta * ScalarType(tensor_c.at(coord)));
165
+ }
166
+ }
167
+ }
168
+ }
169
+ }
170
+ }
171
+
172
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
173
+
174
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
175
+ /// objects.
176
+ template <
177
+ typename ElementA,
178
+ typename LayoutA,
179
+ typename ElementB,
180
+ typename LayoutB,
181
+ typename ElementC,
182
+ typename LayoutC,
183
+ typename ScalarType,
184
+ typename ComputeType,
185
+ typename InnerProductOp = multiply_add<ComputeType>,
186
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
187
+ >
188
+ void compute_gemm(
189
+ gemm::GemmCoord problem_size,
190
+ ScalarType alpha,
191
+ TensorRef<ElementA, LayoutA> tensor_a,
192
+ TensorRef<ElementB, LayoutB> tensor_b,
193
+ ScalarType beta,
194
+ TensorRef<ElementC, LayoutC> tensor_c,
195
+ ComputeType initial_accum) {
196
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
197
+ ScalarType, ComputeType, InnerProductOp, ConvertOp>(
198
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
199
+ initial_accum);
200
+ }
201
+
202
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
203
+
204
+ template <
205
+ typename ElementA,
206
+ typename LayoutA,
207
+ typename ElementB,
208
+ typename LayoutB,
209
+ typename ElementC,
210
+ typename LayoutC,
211
+ typename ScalarType,
212
+ typename ComputeType,
213
+ typename InnerProductOp = cutlass::arch::OpMultiplyAdd
214
+ >
215
+ struct Gemm;
216
+
217
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
218
+
219
+ /// Partial specialization for multiply-add
220
+ template <typename ElementA, typename LayoutA, typename ElementB,
221
+ typename LayoutB, typename ElementC, typename LayoutC,
222
+ typename ScalarType, typename ComputeType>
223
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
224
+ ComputeType, arch::OpMultiplyAdd> {
225
+
226
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
227
+ TensorRef<ElementA, LayoutA> tensor_a,
228
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
229
+ TensorRef<ElementC, LayoutC> tensor_c,
230
+ ComputeType initial_accum = ComputeType(0)) {
231
+ static_assert(
232
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
233
+ "Tensors must be of rank 2");
234
+
235
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
236
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
237
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
238
+ }
239
+
240
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
241
+ TensorRef<ElementA, LayoutA> tensor_a,
242
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
243
+ TensorRef<ElementC, LayoutC> tensor_c,
244
+ TensorRef<ElementC, LayoutC> tensor_d,
245
+ ComputeType initial_accum = ComputeType(0)) {
246
+ static_assert(
247
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
248
+ "Tensors must be of rank 2");
249
+
250
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
251
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
252
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
253
+ }
254
+ };
255
+
256
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
257
+
258
+ /// Partial specialization for multiply-add
259
+ template <typename ElementA, typename LayoutA, typename ElementB,
260
+ typename LayoutB, typename ElementC, typename LayoutC,
261
+ typename ScalarType, typename ComputeType>
262
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
263
+ ComputeType, arch::OpMultiplyAddFastBF16> {
264
+
265
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
266
+ TensorRef<ElementA, LayoutA> tensor_a,
267
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
268
+ TensorRef<ElementC, LayoutC> tensor_c,
269
+ ComputeType initial_accum = ComputeType(0)) {
270
+ static_assert(
271
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
272
+ "Tensors must be of rank 2");
273
+
274
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
275
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
276
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
277
+ }
278
+
279
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
280
+ TensorRef<ElementA, LayoutA> tensor_a,
281
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
282
+ TensorRef<ElementC, LayoutC> tensor_c,
283
+ TensorRef<ElementC, LayoutC> tensor_d,
284
+ ComputeType initial_accum = ComputeType(0)) {
285
+ static_assert(
286
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
287
+ "Tensors must be of rank 2");
288
+
289
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
290
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
291
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
292
+ }
293
+ };
294
+
295
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
296
+
297
+ /// Partial specialization for multiply-add-saturate
298
+ template <typename ElementA, typename LayoutA, typename ElementB,
299
+ typename LayoutB, typename ElementC, typename LayoutC,
300
+ typename ScalarType, typename ComputeType>
301
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
302
+ ComputeType, arch::OpMultiplyAddSaturate> {
303
+
304
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
305
+ TensorRef<ElementA, LayoutA> tensor_a,
306
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
307
+ TensorRef<ElementC, LayoutC> tensor_c,
308
+ ComputeType initial_accum = ComputeType(0)) {
309
+ static_assert(
310
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
311
+ "Tensors must be of rank 2");
312
+
313
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
314
+ ScalarType, ComputeType, multiply_add<ComputeType>,
315
+ NumericConverterClamp<ElementC, ScalarType>>(
316
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
317
+ }
318
+
319
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
320
+ TensorRef<ElementA, LayoutA> tensor_a,
321
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
322
+ TensorRef<ElementC, LayoutC> tensor_c,
323
+ TensorRef<ElementC, LayoutC> tensor_d,
324
+ ComputeType initial_accum = ComputeType(0)) {
325
+ static_assert(
326
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
327
+ "Tensors must be of rank 2");
328
+
329
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
330
+ ScalarType, ComputeType, multiply_add<ComputeType>,
331
+ NumericConverterClamp<ElementC, ScalarType>>(
332
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
333
+ }
334
+ };
335
+
336
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
337
+
338
+ /// Partial specialization for XOR-popc
339
+ template <typename ElementA, typename LayoutA, typename ElementB,
340
+ typename LayoutB, typename ElementC, typename LayoutC,
341
+ typename ScalarType, typename ComputeType>
342
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
343
+ ComputeType, arch::OpXorPopc> {
344
+
345
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
346
+ TensorRef<ElementA, LayoutA> tensor_a,
347
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
348
+ TensorRef<ElementC, LayoutC> tensor_c,
349
+ ComputeType initial_accum = ComputeType(0)) {
350
+ static_assert(
351
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
352
+ "Tensors must be of rank 2");
353
+
354
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
355
+ ScalarType, ComputeType, xor_popc_add<ComputeType>>(
356
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
357
+ }
358
+
359
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
360
+ TensorRef<ElementA, LayoutA> tensor_a,
361
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
362
+ TensorRef<ElementC, LayoutC> tensor_c,
363
+ TensorRef<ElementC, LayoutC> tensor_d,
364
+ ComputeType initial_accum = ComputeType(0)) {
365
+ static_assert(
366
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
367
+ "Tensors must be of rank 2");
368
+
369
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
370
+ ScalarType, ComputeType, xor_popc_add<ComputeType>>(
371
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
372
+ }
373
+ };
374
+
375
+ /// Partial specialization for AND-popc
376
+ template <typename ElementA, typename LayoutA, typename ElementB,
377
+ typename LayoutB, typename ElementC, typename LayoutC,
378
+ typename ScalarType, typename ComputeType>
379
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
380
+ ComputeType, arch::OpAndPopc> {
381
+
382
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
383
+ TensorRef<ElementA, LayoutA> tensor_a,
384
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
385
+ TensorRef<ElementC, LayoutC> tensor_c,
386
+ ComputeType initial_accum = ComputeType(0)) {
387
+ static_assert(
388
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
389
+ "Tensors must be of rank 2");
390
+
391
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
392
+ ScalarType, ComputeType, and_popc_add<ComputeType>>(
393
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
394
+ }
395
+
396
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
397
+ TensorRef<ElementA, LayoutA> tensor_a,
398
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
399
+ TensorRef<ElementC, LayoutC> tensor_c,
400
+ TensorRef<ElementC, LayoutC> tensor_d,
401
+ ComputeType initial_accum = ComputeType(0)) {
402
+ static_assert(
403
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
404
+ "Tensors must be of rank 2");
405
+
406
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
407
+ ScalarType, ComputeType, and_popc_add<ComputeType>>(
408
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
409
+ }
410
+ };
411
+
412
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
413
+
414
+ /// Partial specialization for multiply-add
415
+ template <typename ElementA, typename LayoutA, typename ElementB,
416
+ typename LayoutB, typename ElementC, typename LayoutC,
417
+ typename ScalarType, typename ComputeType>
418
+ struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
419
+ ComputeType, arch::OpMultiplyAddFastF32> {
420
+
421
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
422
+ TensorRef<ElementA, LayoutA> tensor_a,
423
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
424
+ TensorRef<ElementC, LayoutC> tensor_c,
425
+ ComputeType initial_accum = ComputeType(0)) {
426
+ static_assert(
427
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
428
+ "Tensors must be of rank 2");
429
+
430
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
431
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
432
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
433
+ }
434
+
435
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
436
+ TensorRef<ElementA, LayoutA> tensor_a,
437
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
438
+ TensorRef<ElementC, LayoutC> tensor_c,
439
+ TensorRef<ElementC, LayoutC> tensor_d,
440
+ ComputeType initial_accum = ComputeType(0)) {
441
+ static_assert(
442
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
443
+ "Tensors must be of rank 2");
444
+
445
+ compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
446
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
447
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
448
+ }
449
+ };
450
+
451
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
452
+
453
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
454
+ //
455
+ // Batched GEMM
456
+ //
457
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
458
+
459
+ /// Computes a batch of GEMMs over a set of matrices of common dimension.
460
+ //
461
+ // TensorRefCollection* is a type satisfying the TensorRefCollection concept.
462
+ //
463
+ template <
464
+ typename TensorRefCollectionA,
465
+ typename TensorRefCollectionB,
466
+ typename TensorRefCollectionC,
467
+ typename ScalarType,
468
+ typename AccumulatorType
469
+ >
470
+ void BatchedGemm(
471
+ gemm::GemmCoord problem_size,
472
+ int batch_count,
473
+ ScalarType alpha,
474
+ TensorRefCollectionA const& tensor_a,
475
+ TensorRefCollectionB const& tensor_b,
476
+ ScalarType beta,
477
+ TensorRefCollectionC &tensor_c,
478
+ AccumulatorType initial_accum) {
479
+
480
+ typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin();
481
+ typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin();
482
+ typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin();
483
+
484
+ for (int batch = 0;
485
+ batch < batch_count;
486
+ ++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) {
487
+
488
+ Gemm<typename TensorRefCollectionA::Element,
489
+ typename TensorRefCollectionA::Layout,
490
+ typename TensorRefCollectionB::Element,
491
+ typename TensorRefCollectionB::Layout,
492
+ typename TensorRefCollectionC::Element,
493
+ typename TensorRefCollectionC::Layout,
494
+ typename TensorRefCollectionC::Element,
495
+ typename TensorRefCollectionC::Element>
496
+ gemm;
497
+
498
+ gemm(problem_size, alpha, *tensor_a_it, *tensor_b_it, beta, *tensor_c_it,
499
+ initial_accum);
500
+ }
501
+ }
502
+
503
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
504
+ /// objects.
505
+ //
506
+ // TensorRefCollection* is a type satisfying the TensorRefCollection concept.
507
+ //
508
+ template <
509
+ typename TensorRefCollectionA,
510
+ typename TensorRefCollectionB,
511
+ typename TensorRefCollectionC,
512
+ typename ScalarType,
513
+ typename AccumulatorType
514
+ >
515
+ void BatchedGemm(
516
+ gemm::GemmCoord problem_size,
517
+ int batch_count,
518
+ ScalarType alpha,
519
+ TensorRefCollectionA const& tensor_a,
520
+ TensorRefCollectionB const& tensor_b,
521
+ ScalarType beta,
522
+ TensorRefCollectionC &tensor_c) {
523
+
524
+ BatchedGemm(problem_size, batch_count, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0));
525
+ }
526
+
527
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
528
+
529
+ } // namespace host
530
+ } // namespace reference
531
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_complex.h ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued GEMM in host-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/functional.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/matrix_coord.h"
43
+
44
+ #include "cutlass/tensor_view.h"
45
+
46
+ #include "cutlass/gemm/gemm.h"
47
+
48
+ namespace cutlass {
49
+ namespace reference {
50
+ namespace host {
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
55
+ /// objects.
56
+ ///
57
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
58
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
59
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
60
+ /// arguments explicitly.
61
+ template <
62
+ typename ElementA,
63
+ typename LayoutA,
64
+ typename ElementB,
65
+ typename LayoutB,
66
+ typename ElementC,
67
+ typename LayoutC,
68
+ typename ScalarType,
69
+ typename ComputeType,
70
+ typename ElementD = ElementC,
71
+ typename ConvertOp = NumericConverter<ElementD, ScalarType>,
72
+ typename InnerProductOp = multiply_add<ComputeType>
73
+ >
74
+ void GemmComplex(
75
+ gemm::GemmCoord problem_size,
76
+ ScalarType alpha,
77
+ TensorRef<ElementA, LayoutA> tensor_a,
78
+ ComplexTransform transform_a,
79
+ TensorRef<ElementB, LayoutB> tensor_b,
80
+ ComplexTransform transform_b,
81
+ ScalarType beta,
82
+ TensorRef<ElementC, LayoutC> tensor_c,
83
+ TensorRef<ElementD, LayoutC> tensor_d,
84
+ ComputeType initial_accum,
85
+ int batch_count = 1,
86
+ int64_t batch_stride_A = 0,
87
+ int64_t batch_stride_B = 0,
88
+ int64_t batch_stride_C = 0,
89
+ int64_t batch_stride_D = 0) {
90
+
91
+ static_assert(
92
+ LayoutA::kRank == 2 &&
93
+ LayoutB::kRank == 2 &&
94
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
95
+
96
+ // Note: batch is ignored.
97
+ int const M = problem_size.m();
98
+ int const N = problem_size.n();
99
+ int const K = problem_size.k();
100
+
101
+ // Blocking necessary to speedup reference implementation
102
+ int const Mblock = 16;
103
+ int const Nblock = 16;
104
+
105
+ ConvertOp convert_op;
106
+ InnerProductOp inner_product_op;
107
+
108
+ for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
109
+
110
+ // Compute matrix product using blocks
111
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
112
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
113
+
114
+ ComputeType accum[Mblock][Nblock];
115
+
116
+ for (int j = 0; j < Nblock; j++) {
117
+ for (int i = 0; i < Mblock; i++) {
118
+ accum[i][j] = initial_accum;
119
+ }
120
+ }
121
+
122
+ for (int k_block = 0; k_block < K; ++k_block) {
123
+ for (int j = 0; j < Nblock; j++) {
124
+ for (int i = 0; i < Mblock; i++) {
125
+ int row = row_block + i;
126
+ int col = col_block + j;
127
+
128
+ if (row < M && col < N) {
129
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
130
+ ElementB b = tensor_b.at(MatrixCoord(k_block, col));
131
+
132
+ ComputeType a_ik = ComputeType(a);
133
+ ComputeType b_kj = ComputeType(b);
134
+
135
+ if (transform_a == ComplexTransform::kConjugate) {
136
+ a_ik = conj(a_ik);
137
+ }
138
+
139
+ if (transform_b == ComplexTransform::kConjugate) {
140
+ b_kj = conj(b_kj);
141
+ }
142
+
143
+ accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]);
144
+ }
145
+ }
146
+ }
147
+ }
148
+
149
+ for (int j = 0; j < Nblock; j++) {
150
+ for (int i = 0; i < Mblock; i++) {
151
+ int row = row_block + i;
152
+ int col = col_block + j;
153
+
154
+ MatrixCoord coord = MatrixCoord(row, col);
155
+
156
+ if (row < M && col < N) {
157
+
158
+ tensor_d.at(coord) = convert_op(
159
+ alpha * ScalarType(accum[i][j]) +
160
+ beta * ScalarType(tensor_c.at(coord)));
161
+ }
162
+ }
163
+ }
164
+
165
+ } // for (col_block)
166
+ } // for (row_block)
167
+
168
+ tensor_a.add_pointer_offset(batch_stride_A);
169
+ tensor_b.add_pointer_offset(batch_stride_B);
170
+ tensor_c.add_pointer_offset(batch_stride_C);
171
+ tensor_d.add_pointer_offset(batch_stride_D);
172
+
173
+ } // for (batch_idx)
174
+ }
175
+
176
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
177
+
178
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
179
+ /// objects.
180
+ ///
181
+ /// This assumes the accumulator type is the same type as the scalars.
182
+ template <
183
+ typename ElementA,
184
+ typename LayoutA,
185
+ typename ElementB,
186
+ typename LayoutB,
187
+ typename ElementC,
188
+ typename LayoutC,
189
+ typename ScalarType,
190
+ typename ElementD = ElementC
191
+ >
192
+ void GemmComplex(
193
+ gemm::GemmCoord problem_size,
194
+ ScalarType alpha,
195
+ TensorRef<ElementA, LayoutA> tensor_a,
196
+ ComplexTransform transform_a,
197
+ TensorRef<ElementB, LayoutB> tensor_b,
198
+ ComplexTransform transform_b,
199
+ ScalarType beta,
200
+ TensorRef<ElementC, LayoutC> tensor_c,
201
+ TensorRef<ElementD, LayoutC> tensor_d) {
202
+
203
+ GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0));
204
+ }
205
+
206
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
207
+
208
+ } // namespace host
209
+ } // namespace reference
210
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued GEMM in host-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/coord.h"
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/functional.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/tensor_ref_planar_complex.h"
43
+
44
+ #include "cutlass/tensor_view.h"
45
+ #include "cutlass/gemm/gemm.h"
46
+
47
+ namespace cutlass {
48
+ namespace reference {
49
+ namespace host {
50
+
51
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
54
+ /// objects.
55
+ ///
56
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
57
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
58
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
59
+ /// arguments explicitly.
60
+ template <
61
+ typename ElementA,
62
+ typename LayoutA,
63
+ typename ElementB,
64
+ typename LayoutB,
65
+ typename ElementC,
66
+ typename LayoutC,
67
+ typename ScalarType,
68
+ typename ComputeType,
69
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>,
70
+ typename InnerProductOp = multiply_add<complex<ComputeType>>
71
+ >
72
+ void GemmPlanarComplex(
73
+ gemm::GemmCoord problem_size,
74
+ complex<ScalarType> alpha,
75
+ TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
76
+ ComplexTransform transform_a,
77
+ TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
78
+ ComplexTransform transform_b,
79
+ complex<ScalarType> beta,
80
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
81
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_d,
82
+ complex<ComputeType> initial_accum) {
83
+
84
+ static_assert(
85
+ LayoutA::kRank == 2 &&
86
+ LayoutB::kRank == 2 &&
87
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
88
+
89
+ using ComplexA = typename TensorRefPlanarComplex<ElementA, LayoutA>::ComplexElement;
90
+ using ComplexB = typename TensorRefPlanarComplex<ElementB, LayoutB>::ComplexElement;
91
+ using ComplexC = typename TensorRefPlanarComplex<ElementC, LayoutC>::ComplexElement;
92
+
93
+ // Note: batch is ignored.
94
+ int const M = problem_size.m();
95
+ int const N = problem_size.n();
96
+ int const K = problem_size.k();
97
+
98
+ // Blocking necessary to speedup reference implementation
99
+ int const Mblock = 16;
100
+ int const Nblock = 16;
101
+
102
+ ConvertOp convert_op;
103
+ InnerProductOp inner_product_op;
104
+
105
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
106
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
107
+
108
+ complex<ComputeType> accum[Mblock][Nblock];
109
+
110
+ for (int j = 0; j < Nblock; j++) {
111
+ for (int i = 0; i < Mblock; i++) {
112
+ accum[i][j] = initial_accum;
113
+ }
114
+ }
115
+
116
+ for (int k_block = 0; k_block < K; ++k_block) {
117
+ for (int j = 0; j < Nblock; j++) {
118
+ for (int i = 0; i < Mblock; i++) {
119
+ int row = row_block + i;
120
+ int col = col_block + j;
121
+
122
+ if (row < M && col < N) {
123
+
124
+ ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block));
125
+ ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col));
126
+
127
+ complex<ComputeType> a = complex<ComputeType>{
128
+ ComputeType(a_ik.real()),
129
+ ComputeType(a_ik.imag())
130
+ };
131
+
132
+ complex<ComputeType> b = complex<ComputeType>{
133
+ ComputeType(b_kj.real()),
134
+ ComputeType(b_kj.imag())
135
+ };
136
+
137
+ if (transform_a == ComplexTransform::kConjugate) {
138
+ a = conj(a);
139
+ }
140
+
141
+ if (transform_b == ComplexTransform::kConjugate) {
142
+ b = conj(b);
143
+ }
144
+
145
+ accum[i][j] = inner_product_op(a, b, accum[i][j]);
146
+ }
147
+ }
148
+ }
149
+ }
150
+
151
+ for (int j = 0; j < Nblock; j++) {
152
+ for (int i = 0; i < Mblock; i++) {
153
+ int row = row_block + i;
154
+ int col = col_block + j;
155
+
156
+ MatrixCoord coord = MatrixCoord(row, col);
157
+
158
+ if (row < M && col < N) {
159
+
160
+ complex<ScalarType> acc{
161
+ ScalarType(accum[i][j].real()),
162
+ ScalarType(accum[i][j].imag())
163
+ };
164
+
165
+ ComplexC d_ij = tensor_c.at(coord);
166
+
167
+ complex<ScalarType> src{
168
+ ScalarType(d_ij.real()),
169
+ ScalarType(d_ij.imag())
170
+ };
171
+
172
+ complex<ScalarType> result = alpha * acc + beta * src;
173
+
174
+ d_ij.real() = convert_op(result.real());
175
+ d_ij.imag() = convert_op(result.imag());
176
+
177
+ tensor_d.at(coord) = d_ij;
178
+ }
179
+ }
180
+ }
181
+ }
182
+ }
183
+ }
184
+
185
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
186
+
187
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
188
+ /// objects.
189
+ ///
190
+ /// This assumes the accumulator type is the same type as the scalars.
191
+ template <
192
+ typename ElementA,
193
+ typename LayoutA,
194
+ typename ElementB,
195
+ typename LayoutB,
196
+ typename ElementC,
197
+ typename LayoutC,
198
+ typename ScalarType
199
+ >
200
+ void GemmPlanarComplex(
201
+ gemm::GemmCoord problem_size,
202
+ complex<ScalarType> alpha,
203
+ TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
204
+ ComplexTransform transform_a,
205
+ TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
206
+ ComplexTransform transform_b,
207
+ complex<ScalarType> beta,
208
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
209
+ TensorRefPlanarComplex<ElementC, LayoutC> tensor_d) {
210
+
211
+ GemmPlanarComplex(
212
+ problem_size,
213
+ alpha,
214
+ tensor_a, transform_a,
215
+ tensor_b, transform_b,
216
+ beta,
217
+ tensor_c,
218
+ tensor_d,
219
+ complex<ScalarType>());
220
+ }
221
+
222
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
223
+
224
+ } // namespace host
225
+ } // namespace reference
226
+ } // namespace cutlass
227
+
228
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gett.hpp ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for GETT in host-side code.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ /////////////////////////////////////////////////////////////////////////////////////////////////
38
+ #include "cutlass/gemm/gemm.h"
39
+ #include "cutlass/complex.h"
40
+ #include "cutlass/numeric_conversion.h"
41
+ #include "cutlass/epilogue/thread/activation.h"
42
+ #include "cutlass/relatively_equal.h"
43
+
44
+ #include "cute/tensor.hpp"
45
+ #include "cute/pointer.hpp"
46
+
47
+ /////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ namespace cutlass::reference::host {
50
+
51
+ template<class T, class = void>
52
+ struct ElementTraits {
53
+ using type = T;
54
+ };
55
+
56
+ template<class T>
57
+ struct ElementTraits<T, std::enable_if_t<!std::is_same_v<decltype(std::declval<T>().get()), void> > > {
58
+ using type = decltype(std::declval<T>().get());
59
+ };
60
+
61
+ /////////////////////////////////////////////////////////////////////////////////////////////////
62
+
63
+ ///////////////////////////////////////////////////////////
64
+ //
65
+ // Gett Mainloop Parameters
66
+ //
67
+ ///////////////////////////////////////////////////////////
68
+
69
+ template<
70
+ class ElementAccumulator_,
71
+ class TensorA_, // (M, K, L)
72
+ class TensorB_ // (N, K, L)
73
+
74
+ , class TensorSfA_ = TensorA_,
75
+ class TensorSfB_ = TensorB_
76
+
77
+ >
78
+ struct GettMainloopParams {
79
+ using ElementAccumulator = ElementAccumulator_;
80
+ using TensorA = TensorA_;
81
+ using TensorB = TensorB_;
82
+ using EngineA = typename TensorA::engine_type;
83
+ using LayoutA = typename TensorA::layout_type;
84
+ using EngineB = typename TensorB::engine_type;
85
+ using LayoutB = typename TensorB::layout_type;
86
+
87
+ TensorA A{};
88
+ TensorB B{};
89
+
90
+ ComplexTransform transform_A = ComplexTransform::kNone;
91
+ ComplexTransform transform_B = ComplexTransform::kNone;
92
+
93
+
94
+ using TensorSfA = TensorSfA_;
95
+ using TensorSfB = TensorSfB_;
96
+ using EngineSfA = typename TensorSfA::engine_type;
97
+ using LayoutSfA = typename TensorSfA::layout_type;
98
+ using EngineSfB = typename TensorSfB::engine_type;
99
+ using LayoutSfB = typename TensorSfB::layout_type;
100
+ TensorSfA_ SfA{};
101
+ TensorSfB_ SfB{};
102
+
103
+
104
+ GettMainloopParams() {}
105
+
106
+ GettMainloopParams(TensorA tensor_A, TensorB tensor_B)
107
+ : A(tensor_A), B(tensor_B) {}
108
+
109
+
110
+ GettMainloopParams(TensorA tensor_A, TensorSfA tensor_SfA, TensorB tensor_B, TensorSfB tensor_SfB)
111
+ : A(tensor_A), SfA(tensor_SfA),
112
+ B(tensor_B), SfB(tensor_SfB) {}
113
+
114
+
115
+ };
116
+
117
+
118
+
119
+ ////////////////////////////////////////////////////////////////////////
120
+ //
121
+ // Gett Mainloop Parameter Specialization for Block Scaled GEMM kernels
122
+ //
123
+ ////////////////////////////////////////////////////////////////////////
124
+
125
+ template<
126
+ class ElementAccumulator_,
127
+ class TensorA_, // (M, K, L)
128
+ class TensorSfA_, // (M, K, L)
129
+ class TensorB_, // (N, K, L)
130
+ class TensorSfB_ // (N, K, L)
131
+ >
132
+ struct GettBlockScalingMainloopParams : public GettMainloopParams<ElementAccumulator_, TensorA_, TensorB_, TensorSfA_, TensorSfB_> {
133
+ using Base = GettMainloopParams<ElementAccumulator_, TensorA_, TensorB_, TensorSfA_, TensorSfB_>;
134
+ using ElementAccumulator = typename Base::ElementAccumulator;
135
+ using TensorA = typename Base::TensorA;
136
+ using TensorB = typename Base::TensorB;
137
+ using EngineA = typename Base::EngineA;
138
+ using LayoutA = typename Base::LayoutA;
139
+ using EngineB = typename Base::EngineB;
140
+ using LayoutB = typename Base::LayoutB;
141
+ ComplexTransform transform_A = Base::transform_A;
142
+ ComplexTransform transform_B = Base::transform_B;
143
+
144
+ using TensorSfA = typename Base::TensorSfA;
145
+ using TensorSfB = typename Base::TensorSfB;
146
+ using EngineSfA = typename Base::EngineSfA;
147
+ using LayoutSfA = typename Base::LayoutSfA;
148
+ using EngineSfB = typename Base::EngineSfB;
149
+ using LayoutSfB = typename Base::LayoutSfB;
150
+
151
+ GettBlockScalingMainloopParams() {}
152
+
153
+ GettBlockScalingMainloopParams(TensorA tensor_A, TensorSfA tensor_SfA, TensorB tensor_B, TensorSfB tensor_SfB)
154
+ : Base(tensor_A, tensor_SfA, tensor_B, tensor_SfB) {}
155
+
156
+
157
+ };
158
+
159
+
160
+ /////////////////////////////////////////////////////////////////////////////////////////////////
161
+
162
+ enum class SfStrategy {
163
+ None = 0,
164
+ SfDGen = 1
165
+ };
166
+
167
+
168
+ ///////////////////////////////////////////////////////////
169
+ //
170
+ // Gett Epilogue Parameters
171
+ //
172
+ ///////////////////////////////////////////////////////////
173
+
174
+ template<
175
+ class ElementScalar_,
176
+ class ElementScalingFactor_,
177
+ class ElementAccumulator_,
178
+ class ElementCompute_,
179
+ class TensorC_, // (M, N, L)
180
+ class TensorD_, // (M, N, L)
181
+ class VectorBias_ = decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // (M, 1)
182
+ class TensorAux_ = decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // (M, N, L)
183
+ class VectorAlpha_ = decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // (M, 1)
184
+ class VectorBeta_ = VectorAlpha_, // (M, 1)
185
+ class ActivationFunctor_ = cutlass::epilogue::thread::Identity<ElementCompute_>,
186
+ class TensorSFD_ = TensorD_,
187
+ class SFD_VectorSize_ = cute::Int<0>,
188
+ class BiasBinaryOp_ = cutlass::plus<ElementCompute_>,
189
+ bool PerColumnBias_ = false
190
+ ,
191
+ SfStrategy SfGenStrategy_ = SfStrategy::None
192
+ >
193
+ struct GettEpilogueParams {
194
+ using ElementScalar = ElementScalar_;
195
+ using ElementScalingFactor = ElementScalingFactor_;
196
+ using ElementAccumulator = ElementAccumulator_;
197
+ using ElementCompute = ElementCompute_;
198
+ using TensorC = TensorC_;
199
+ using TensorD = TensorD_;
200
+ using TensorAux = TensorAux_;
201
+ using VectorBias = VectorBias_;
202
+ using VectorAlpha = VectorAlpha_;
203
+ using VectorBeta = VectorBeta_;
204
+ using TensorSFD = TensorSFD_;
205
+ using SFD_VectorSize = SFD_VectorSize_;
206
+ using ActivationFunctor = ActivationFunctor_;
207
+ using BiasBinaryOp = BiasBinaryOp_;
208
+
209
+ using EngineC = typename TensorC::engine_type;
210
+ using LayoutC = typename TensorC::layout_type;
211
+ using EngineD = typename TensorD::engine_type;
212
+ using LayoutD = typename TensorD::layout_type;
213
+ using EngineSfD = typename TensorSFD::engine_type;
214
+ using LayoutSfD = typename TensorSFD::layout_type;
215
+ static constexpr bool PerColumnBias = PerColumnBias_;
216
+ static constexpr SfStrategy SfGenStrategy = SfGenStrategy_;
217
+
218
+ ElementScalar alpha = ElementScalar(1);
219
+ ElementScalar beta = ElementScalar(0);
220
+
221
+ TensorC C{};
222
+ TensorD D{};
223
+ VectorBias Bias{};
224
+ TensorAux Aux{};
225
+ VectorAlpha Valpha{};
226
+ VectorBeta Vbeta{};
227
+ TensorSFD SfD{};
228
+ ElementCompute st = ElementCompute(1);
229
+
230
+ ElementAccumulator* abs_max_D = nullptr;
231
+ ElementAccumulator* abs_max_Aux = nullptr;
232
+
233
+ ElementScalingFactor scale_a = ElementScalingFactor(1);
234
+ ElementScalingFactor scale_b = ElementScalingFactor(1);
235
+ ElementScalingFactor scale_c = ElementScalingFactor(1);
236
+ ElementScalingFactor scale_d = ElementScalingFactor(1);
237
+ ElementScalingFactor scale_aux = ElementScalingFactor(1);
238
+
239
+ bool beta_per_channel_scaling = false;
240
+ GettEpilogueParams() {}
241
+
242
+ GettEpilogueParams(ElementScalar alpha, ElementScalar beta, TensorC tensor_C, TensorD tensor_D)
243
+ : alpha(alpha), beta(beta), C(tensor_C), D(tensor_D) {}
244
+
245
+
246
+ GettEpilogueParams(ElementScalar alpha, ElementScalar beta, TensorC tensor_C, TensorD tensor_D, TensorSFD tensor_SfD, ElementCompute epilogue_st)
247
+ : alpha(alpha), beta(beta), C(tensor_C), D(tensor_D), SfD(tensor_SfD), st(epilogue_st) {}
248
+
249
+
250
+ GettEpilogueParams(
251
+ ElementScalar alpha, ElementScalar beta,
252
+ TensorC tensor_C, TensorD tensor_D,
253
+ VectorBias bias, TensorAux tensor_aux,
254
+ VectorAlpha vector_alpha, VectorBeta vector_beta)
255
+ : alpha(alpha), beta(beta),
256
+ C(tensor_C), D(tensor_D),
257
+ Bias(bias), Aux(tensor_aux),
258
+ Valpha(vector_alpha), Vbeta(vector_beta) {}
259
+ };
260
+
261
+
262
+
263
+ ////////////////////////////////////////////////////////////////////////
264
+ //
265
+ // Gett Epilogue Parameters Specialization for Block Scaled GEMM kernels
266
+ //
267
+ ////////////////////////////////////////////////////////////////////////
268
+
269
+ template<
270
+ class ElementScalar_,
271
+ class ElementAccumulator_,
272
+ class ElementCompute_,
273
+ class TensorC_,
274
+ class TensorD_,
275
+ class TensorSfD_ = TensorD_,
276
+ class SFD_VectorSize_ = cute::Int<0>,
277
+ SfStrategy SfGenStrategy_ = SfStrategy::None
278
+ >
279
+ struct GettBlockScalingEpilogueParams : public GettEpilogueParams<
280
+ ElementScalar_, // ElementScalar
281
+ ElementScalar_, // ElementScalingFactor
282
+ ElementAccumulator_, // ElementAccumulator
283
+ ElementCompute_, // ElementCompute
284
+ TensorC_, // TensorC (M, N, L)
285
+ TensorD_, // TensorD (M, N, L)
286
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // VectorBias (M, 1)
287
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // TensorAux (M, N, L)
288
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // VectorAlpha (M, 1)
289
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // VectorBeta (M, 1)
290
+ cutlass::epilogue::thread::Identity<ElementCompute_>, //
291
+ TensorSfD_, // TensorSfD
292
+ SFD_VectorSize_, // SFD_VectorSize
293
+ cutlass::plus<ElementCompute_>, // class BiasBinaryOp_ =
294
+ false, //PerColumnBias_
295
+ SfGenStrategy_ // SfGenStrategy
296
+ > {
297
+ using Base = GettEpilogueParams<
298
+ ElementScalar_, // ElementScalar
299
+ ElementScalar_, // ElementScalingFactor
300
+ ElementAccumulator_, // ElementAccumulator
301
+ ElementCompute_, // ElementCompute
302
+ TensorC_, // TensorC (M, N, L)
303
+ TensorD_, // TensorD (M, N, L)
304
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // VectorBias (M, 1)
305
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // TensorAux (M, N, L)
306
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // VectorAlpha (M, 1)
307
+ decltype(make_tensor(cute::recast_ptr<ElementCompute_>(nullptr), typename TensorD_::layout_type{})), // VectorBeta (M, 1)
308
+ cutlass::epilogue::thread::Identity<ElementCompute_>, //
309
+ TensorSfD_, // TensorSfD
310
+ SFD_VectorSize_, // SFD_VectorSize
311
+ cutlass::plus<ElementCompute_>, // BiasBinaryOp
312
+ false, // PerColumnBias
313
+ SfGenStrategy_ // SfGenStrategy
314
+ >;
315
+ using ElementScalar = typename Base::ElementScalar;
316
+ using ElementScalingFactor = typename Base::ElementScalingFactor;
317
+ using ElementAccumulator = typename Base::ElementAccumulator;
318
+ using ElementCompute = typename Base::ElementCompute;
319
+ using TensorC = typename Base::TensorC;
320
+ using TensorD = typename Base::TensorD;
321
+ using TensorAux = typename Base::TensorAux;
322
+ using VectorBias = typename Base::VectorBias;
323
+ using VectorAlpha = typename Base::VectorAlpha;
324
+ using VectorBeta = typename Base::VectorBeta;
325
+ using TensorSFD = typename Base::TensorSFD;
326
+ using SFD_VectorSize = typename Base::SFD_VectorSize;
327
+ using ActivationFunctor = typename Base::ActivationFunctor;
328
+ using BiasBinaryOp = typename Base::BiasBinaryOp;
329
+
330
+ using EngineC = typename Base::EngineC;
331
+ using LayoutC = typename Base::LayoutC;
332
+ using EngineD = typename Base::EngineD;
333
+ using LayoutD = typename Base::LayoutD;
334
+ using EngineSfD = typename Base::EngineSfD;
335
+ using LayoutSfD = typename Base::LayoutSfD;
336
+ static constexpr bool PerColumnBias = Base::PerColumnBias;
337
+ static constexpr SfStrategy SfGenStrategy = Base::SfGenStrategy;
338
+
339
+ GettBlockScalingEpilogueParams() {}
340
+
341
+ GettBlockScalingEpilogueParams(ElementScalar alpha, ElementScalar beta, TensorC tensor_C, TensorD tensor_D)
342
+ : Base(alpha, beta, tensor_C, tensor_D) {}
343
+
344
+ GettBlockScalingEpilogueParams(ElementScalar alpha, ElementScalar beta, TensorC tensor_C, TensorD tensor_D, TensorSFD tensor_SfD)
345
+ : Base(alpha, beta, tensor_C, tensor_D, tensor_SfD, ElementCompute{0}) {}
346
+
347
+ GettBlockScalingEpilogueParams(ElementScalar alpha, ElementScalar beta, TensorC tensor_C, TensorD tensor_D, TensorSFD tensor_SfD, ElementCompute epilogue_st)
348
+ : Base(alpha, beta, tensor_C, tensor_D, tensor_SfD, epilogue_st) {}
349
+ };
350
+
351
+
352
+
353
+
354
+
355
+ ///////////////////////////////////////////////////////////
356
+ //
357
+ // Generic Gett 3x Implementation
358
+ //
359
+ ///////////////////////////////////////////////////////////
360
+
361
+
362
+ /////////////////////////////////////////////////////////////////////////////////////////////////
363
+ template <int kVectorSize, class EpilogueParams, class TensorD, class TensorSFD, class ElementCompute, int kBlockM, int kBlockN>
364
+ void compute_1d_scaling_factor_and_quantized_output(
365
+ EpilogueParams const& epilogue_params,
366
+ TensorD &tensor_D,
367
+ TensorSFD &tensor_SfD,
368
+ int64_t m,
369
+ int64_t n,
370
+ int64_t l,
371
+ ElementCompute (&acc)[kBlockM][kBlockN])
372
+ {
373
+ using ElementD = typename ElementTraits<typename EpilogueParams::EngineD::value_type>::type;
374
+ using ElementSfD = typename ElementTraits<typename EpilogueParams::EngineSfD::value_type>::type;
375
+
376
+ int const M = cute::size<0>(tensor_D.layout());
377
+ int const N = cute::size<1>(tensor_D.layout());
378
+ int const L = cute::size<2>(tensor_D.layout());
379
+
380
+ auto mul = cutlass::multiplies<ElementCompute>{};
381
+ auto div = divides<ElementCompute>{};
382
+ // Get FP max
383
+ ElementCompute fp_max = ElementCompute(std::numeric_limits<ElementD>::max());
384
+ float scale_down_factor = div(1.0f, fp_max);
385
+ // Get st' = st / FP max
386
+ ElementCompute st_scaled_down = mul(epilogue_params.st, scale_down_factor);
387
+
388
+ absolute_value_op<ElementCompute> abs_op;
389
+ maximum_with_nan_propogation<ElementCompute> max_op;
390
+
391
+ if constexpr (cute::is_constant<1, decltype(cute::stride<0,0,1>(tensor_SfD))>::value) {
392
+ // MN major output
393
+ int const NumVecPerBlock = ceil_div(kBlockM, kVectorSize);
394
+ // Col major output
395
+ for (int n_b = 0; n_b < kBlockN; ++n_b) {
396
+ for (int v_b = 0; v_b < NumVecPerBlock; ++v_b) {
397
+ int64_t col = n + n_b;
398
+
399
+ /// Step1: get max across a vector
400
+ ElementCompute accum_max = ElementCompute(0);
401
+ for (int v = 0; v < kVectorSize; v++) {
402
+ int accum_row = v_b * kVectorSize + v;
403
+ int64_t output_row = accum_row + m;
404
+ if (output_row < M && col < N) {
405
+ accum_max = max_op(accum_max, abs_op(acc[accum_row][n_b]));
406
+ }
407
+ }
408
+
409
+ /// Step2: Compute Scale
410
+ ElementCompute pvscale = mul(accum_max, st_scaled_down);
411
+ ElementSfD qpvscale = static_cast<ElementSfD>(pvscale);
412
+ // Store the Scaling Factors
413
+ int64_t sf_row = m + kVectorSize * v_b;
414
+ if (sf_row < M && col < N) {
415
+ tensor_SfD(sf_row, col, l) = qpvscale;
416
+ }
417
+
418
+ /// Step3: Compute quantized output values
419
+ ElementCompute qpvscale_up = NumericConverter<ElementCompute, ElementSfD>{}(qpvscale);
420
+ // Get float reciprocal
421
+ ElementCompute qpvscale_rcp = div(1.0f, qpvscale_up);
422
+ ElementCompute acc_scale = mul(epilogue_params.st, qpvscale_rcp);
423
+ // Map INF to fp32::max
424
+ acc_scale = cutlass::minimum_with_nan_propagation<ElementCompute>{}(acc_scale, cutlass::platform::numeric_limits<ElementCompute>::max());
425
+ // Store the intermediate_accum
426
+ for (int v = 0; v < kVectorSize; v++) {
427
+ int accum_row = v_b * kVectorSize + v;
428
+ int64_t output_row = accum_row + m;
429
+ if (output_row < M && col < N) {
430
+ acc[accum_row][n_b] = mul(acc[accum_row][n_b], acc_scale);
431
+ }
432
+ }
433
+ }
434
+ }
435
+ }
436
+ else {
437
+ int const NumVecPerBlock = ceil_div(kBlockN, kVectorSize);
438
+ // row major output
439
+ for (int m_b = 0; m_b < kBlockM; ++m_b) {
440
+ for (int v_b = 0; v_b < NumVecPerBlock; ++v_b) {
441
+ int64_t row = m + m_b;
442
+
443
+ /// Step1: get max across a vector
444
+ ElementCompute accum_max = ElementCompute(0);
445
+ for (int v = 0; v < kVectorSize; v++) {
446
+ int accum_col = v_b * kVectorSize + v;
447
+ int64_t output_col = accum_col + n;
448
+ if (row < M && output_col < N) {
449
+ accum_max = max_op(accum_max, abs_op(acc[m_b][accum_col]));
450
+ }
451
+ }
452
+
453
+ /// Step2: Compute Scale
454
+ ElementCompute pvscale = mul(accum_max, st_scaled_down);
455
+ ElementSfD qpvscale = static_cast<ElementSfD>(pvscale);
456
+ // Store the Scaling Factors
457
+ int64_t sf_col = n + kVectorSize * v_b;
458
+
459
+ if (row < M && sf_col < N) {
460
+ tensor_SfD(row, sf_col, l) = qpvscale;
461
+ }
462
+
463
+ /// Step3: Compute quantized output values
464
+ ElementCompute qpvscale_up = NumericConverter<ElementCompute, ElementSfD>{}(qpvscale);
465
+ // Get float reciprocal
466
+ ElementCompute qpvscale_rcp = div(1.0f, qpvscale_up);
467
+ ElementCompute acc_scale = mul(epilogue_params.st, qpvscale_rcp);
468
+ // Map INF to fp32::max
469
+ acc_scale = cutlass::minimum_with_nan_propagation<ElementCompute>{}(acc_scale, cutlass::platform::numeric_limits<ElementCompute>::max());
470
+ // Store the intermediate_accum
471
+ for (int v = 0; v < kVectorSize; v++) {
472
+ int accum_col = v_b * kVectorSize + v;
473
+ int64_t output_col = accum_col + n;
474
+ if (row < M && output_col < N) {
475
+ acc[m_b][accum_col] = mul(acc[m_b][accum_col], acc_scale);
476
+ }
477
+ }
478
+ }
479
+ }
480
+ }
481
+ }
482
+
483
+
484
+ /////////////////////////////////////////////////////////////////////////////////////////////////
485
+
486
+ /// GETT - General Tensor-Tensor contraction reference kernel
487
+ template <
488
+ class MainloopParams,
489
+ class EpilogueParams
490
+ >
491
+ void Gett(
492
+ MainloopParams const& mainloop_params,
493
+ EpilogueParams const& epilogue_params)
494
+ {
495
+
496
+ static int constexpr kBlockM = 64;
497
+ static int constexpr kBlockN = 64;
498
+
499
+ #if defined(_OPENMP)
500
+ #pragma omp parallel for collapse(3)
501
+ #endif
502
+ for (int64_t l = 0; l < cute::size<2>(mainloop_params.A.layout()); ++l) {
503
+ for (int64_t m = 0; m < cute::size<0>(mainloop_params.A.layout()); m += kBlockM) {
504
+ for (int64_t n = 0; n < cute::size<0>(mainloop_params.B.layout()); n += kBlockN) {
505
+ typename MainloopParams::ElementAccumulator acc[kBlockM][kBlockN];
506
+ gett_mainloop(mainloop_params, m, n, l, acc);
507
+ gett_epilogue(epilogue_params, m, n, l, acc);
508
+ }
509
+ }
510
+ }
511
+ }
512
+
513
+ /////////////////////////////////////////////////////////////////////////////////////////////////
514
+
515
+ /// GETT - Mainloop
516
+ template <class MainloopParams, class ElementAccumulator, int kBlockM, int kBlockN>
517
+ void gett_mainloop(
518
+ MainloopParams const& mainloop_params,
519
+ int64_t m,
520
+ int64_t n,
521
+ int64_t l,
522
+ ElementAccumulator (&acc)[kBlockM][kBlockN])
523
+ {
524
+
525
+ static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B");
526
+ static_assert(cute::rank(typename MainloopParams::LayoutB{}) == 3, "N, K, B");
527
+
528
+ using cute::raw_pointer_cast;
529
+
530
+ using ElementA = typename ElementTraits<typename MainloopParams::EngineA::value_type>::type;
531
+ using ElementB = typename ElementTraits<typename MainloopParams::EngineB::value_type>::type;
532
+
533
+
534
+ using ElementSFA = typename ElementTraits<typename MainloopParams::EngineSfA::value_type>::type;
535
+ using ElementSFB = typename ElementTraits<typename MainloopParams::EngineSfB::value_type>::type;
536
+
537
+
538
+ using RingOp = multiply_add<ElementAccumulator, ElementAccumulator, ElementAccumulator>;
539
+ RingOp fma_op;
540
+
541
+ // Zero out accumulators
542
+ for (int m_b = 0; m_b < kBlockM; ++m_b) {
543
+ for (int n_b = 0; n_b < kBlockN; ++n_b) {
544
+ acc[m_b][n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity
545
+ }
546
+ }
547
+
548
+ // Compute on this k-block
549
+ for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) {
550
+ // Load A
551
+ ElementAccumulator a_frag[kBlockM];
552
+ for (int m_b = 0; m_b < kBlockM; ++m_b) {
553
+ if (m + m_b < cute::size<0>(mainloop_params.A.layout())) {
554
+ // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type.
555
+ a_frag[m_b] = static_cast<ElementAccumulator>(ElementA(mainloop_params.A(m + m_b, k, l)));
556
+
557
+
558
+ if constexpr (not cute::is_same_v<ElementSFA, ElementA>){
559
+ // Load SFA
560
+ auto sfa = static_cast<ElementAccumulator>(mainloop_params.SfA(m + m_b, k, l));
561
+ a_frag[m_b] *= sfa;
562
+ }
563
+
564
+
565
+ if (mainloop_params.transform_A == ComplexTransform::kConjugate) {
566
+ a_frag[m_b] = conj(a_frag[m_b]);
567
+ }
568
+ } else {
569
+ a_frag[m_b] = ElementAccumulator(0); // RingOp::AdditionIdentity
570
+ }
571
+ }
572
+
573
+ // Load B
574
+ ElementAccumulator b_frag[kBlockN];
575
+ for (int n_b = 0; n_b < kBlockN; ++n_b) {
576
+ if (n + n_b < cute::size<0>(mainloop_params.B.layout())) {
577
+ // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type.
578
+ b_frag[n_b] = static_cast<ElementAccumulator>(ElementB(mainloop_params.B(n + n_b, k, l)));
579
+
580
+
581
+ if constexpr (not cute::is_same_v<ElementSFB, ElementB>){
582
+ // Load SFB
583
+ auto sfb = static_cast<ElementAccumulator>(mainloop_params.SfB(n + n_b, k, l));
584
+ b_frag[n_b] *= sfb;
585
+ }
586
+
587
+
588
+ if (mainloop_params.transform_B == ComplexTransform::kConjugate) {
589
+ b_frag[n_b] = conj(b_frag[n_b]);
590
+ }
591
+ } else {
592
+ b_frag[n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity
593
+ }
594
+ }
595
+
596
+ // do compute
597
+ for (int m_b = 0; m_b < kBlockM; ++m_b) {
598
+ for (int n_b = 0; n_b < kBlockN; ++n_b) {
599
+ acc[m_b][n_b] = fma_op(a_frag[m_b], b_frag[n_b], acc[m_b][n_b]);
600
+ }
601
+ }
602
+
603
+ }
604
+ }
605
+
606
+ /////////////////////////////////////////////////////////////////////////////////////////////////
607
+
608
+ /// GETT - Epilogue
609
+ template <class EpilogueParams, class ElementAccumulator, int kBlockM, int kBlockN>
610
+ void gett_epilogue(
611
+ EpilogueParams const& epilogue_params,
612
+ int64_t m,
613
+ int64_t n,
614
+ int64_t l,
615
+ ElementAccumulator (&acc)[kBlockM][kBlockN])
616
+ {
617
+ static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == 3, "M, K, B");
618
+ static_assert(cute::rank(typename EpilogueParams::LayoutD{}) == 3, "N, K, B");
619
+
620
+ using cute::raw_pointer_cast;
621
+
622
+ using ElementCompute = typename EpilogueParams::ElementCompute;
623
+ using ElementC = typename EpilogueParams::TensorC::value_type;
624
+ using ElementD = typename EpilogueParams::TensorD::value_type;
625
+ using ElementSfD = typename EpilogueParams::TensorSFD::value_type;
626
+ using ElementAux = typename EpilogueParams::TensorAux::value_type;
627
+ using ElementBias = typename EpilogueParams::VectorBias::value_type;
628
+ using ElementScalar = typename EpilogueParams::ElementScalar;
629
+ using ElementScalingFactor = typename EpilogueParams::ElementScalingFactor;
630
+ using ActivationFunctor = typename EpilogueParams::ActivationFunctor;
631
+ using BiasBinaryOp = typename EpilogueParams::BiasBinaryOp;
632
+
633
+ constexpr bool PerColBias = EpilogueParams::PerColumnBias;
634
+ constexpr SfStrategy SfGenStrategy = EpilogueParams::SfGenStrategy;
635
+
636
+ constexpr bool IsScalingAndAmaxOutputNeeded =
637
+ cute::is_same_v<ElementD, cutlass::float_e4m3_t> or
638
+ cute::is_same_v<ElementD, cutlass::float_e5m2_t>;
639
+
640
+ constexpr bool IsScalingAndAmaxAuxOutputNeeded =
641
+ cute::is_same_v<ElementAux, cutlass::float_e4m3_t> or
642
+ cute::is_same_v<ElementAux, cutlass::float_e5m2_t>;
643
+
644
+ constexpr bool IsReLUAuxNeeded =
645
+ (cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ReLu<ElementCompute>> or
646
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::Clamp<ElementCompute>>) and
647
+ cute::is_same_v<ElementAux, cutlass::uint1b_t>;
648
+ constexpr bool UseReLU =
649
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::Clamp<ElementCompute>>; // Treat Clamp as ReLU
650
+
651
+ constexpr bool IsBackpropFusion =
652
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::dGELU<ElementCompute>> or
653
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::dReLU<ElementCompute>>;
654
+
655
+ // Input related converter
656
+ NumericConverter<ElementCompute, ElementAccumulator> accumulator_converter;
657
+ NumericConverter<ElementCompute, ElementC> source_converter;
658
+ NumericConverter<ElementCompute, ElementBias> bias_converter;
659
+ [[maybe_unused]] NumericConverter<ElementCompute, ElementAux> aux_source_converter;
660
+
661
+ // Scale related converter
662
+ NumericConverter<ElementCompute, ElementScalar> scale_converter;
663
+ NumericConverter<ElementCompute, ElementScalingFactor> scaling_factor_converter;
664
+
665
+ // Abs max converter
666
+ [[maybe_unused]] NumericConverter<ElementAccumulator, ElementCompute> abs_max_output_converter;
667
+
668
+ // Output related converter
669
+ NumericConverter<ElementD, ElementCompute> destination_converter;
670
+ [[maybe_unused]] NumericConverter<ElementAux, ElementCompute> aux_destination_converter;
671
+ NumericConverter<ElementBias, ElementCompute> dBias_converter;
672
+
673
+ // Epilogue operations
674
+ multiply_add<ElementCompute, ElementCompute, ElementCompute> epilogue_fma;
675
+ multiplies<ElementCompute> mul;
676
+ plus<ElementCompute> add;
677
+
678
+ // Activation operation
679
+ ActivationFunctor activation;
680
+
681
+ // Bias binary operation
682
+ BiasBinaryOp bias_op;
683
+
684
+ // Do conversion
685
+ ElementCompute converted_alpha = scale_converter(epilogue_params.alpha);
686
+ ElementCompute converted_beta = scale_converter(epilogue_params.beta);
687
+ ElementCompute converted_scale_a = scaling_factor_converter(epilogue_params.scale_a);
688
+ ElementCompute converted_scale_b = scaling_factor_converter(epilogue_params.scale_b);
689
+ ElementCompute converted_scale_c = scaling_factor_converter(epilogue_params.scale_c);
690
+ ElementCompute converted_scale_d = scaling_factor_converter(epilogue_params.scale_d);
691
+ ElementCompute converted_scale_aux = scaling_factor_converter(epilogue_params.scale_aux);
692
+
693
+ // Init local var
694
+ [[maybe_unused]] ElementCompute local_abs_max_output = ElementCompute(0);
695
+ [[maybe_unused]] ElementCompute local_abs_max_aux_output = ElementCompute(0);
696
+
697
+ converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b));
698
+ converted_beta = mul(converted_beta, converted_scale_c);
699
+
700
+ ElementCompute inter_accum[kBlockM][kBlockN];
701
+
702
+ for (int m_b = 0; m_b < kBlockM; ++m_b) {
703
+ ElementCompute local_dBias = ElementCompute(0);
704
+
705
+ for (int n_b = 0; n_b < kBlockN; ++n_b) {
706
+ if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) {
707
+ // Convert every type to ElementCompute first, do compute, convert to output type, write it out
708
+ ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]);
709
+ // vector alpha
710
+ if (raw_pointer_cast(epilogue_params.Valpha.data())) {
711
+ converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l));
712
+ converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b));
713
+ }
714
+ ElementCompute output = mul(converted_alpha, converted_acc);
715
+
716
+ if (raw_pointer_cast(epilogue_params.Bias.data()) && not IsBackpropFusion) {
717
+ ElementCompute converted_bias = bias_converter(epilogue_params.Bias(PerColBias ? n + n_b : m + m_b));
718
+ output = bias_op(output, converted_bias);
719
+ }
720
+
721
+ if (raw_pointer_cast(epilogue_params.C.data())) {
722
+ ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l));
723
+ // vector beta
724
+ if (epilogue_params.Vbeta.data()) {
725
+ converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b, n + n_b, l));
726
+ converted_beta = mul(converted_beta, converted_scale_c);
727
+ }
728
+ output = epilogue_fma(converted_beta, converted_src, output);
729
+ }
730
+
731
+ if constexpr (IsBackpropFusion) {
732
+ ElementAux aux_input = ElementAux(0);
733
+ if (raw_pointer_cast(epilogue_params.Aux.data())) {
734
+ aux_input = epilogue_params.Aux(m + m_b, n + n_b, l);
735
+ }
736
+
737
+ output = activation(output, aux_source_converter(aux_input));
738
+ local_dBias = add(local_dBias, output);
739
+ }
740
+ else {
741
+ if (raw_pointer_cast(epilogue_params.Aux.data())) {
742
+ auto aux_output = output;
743
+ if constexpr (IsScalingAndAmaxAuxOutputNeeded) {
744
+ maximum_absolute_value_reduction<ElementCompute, true> amax_op;
745
+ local_abs_max_aux_output = amax_op(local_abs_max_aux_output, aux_output);
746
+ aux_output = epilogue_fma(converted_scale_aux, aux_output, ElementCompute(0));
747
+ }
748
+
749
+ if constexpr (IsReLUAuxNeeded) {
750
+ epilogue_params.Aux(m + m_b, n + n_b, l) = not (aux_output < 0) ? uint1b_t(1) : uint1b_t(0);
751
+ } else {
752
+ epilogue_params.Aux(m + m_b, n + n_b, l) = aux_destination_converter(aux_output);
753
+ }
754
+ }
755
+
756
+ if constexpr (UseReLU) {
757
+ cutlass::epilogue::thread::ReLU<ElementCompute> relu;
758
+ output = relu(output);
759
+ }
760
+ else {
761
+ output = activation(output);
762
+ }
763
+ }
764
+
765
+ if constexpr (IsScalingAndAmaxOutputNeeded) {
766
+ maximum_absolute_value_reduction<ElementCompute, true> amax_op;
767
+ local_abs_max_output = amax_op(local_abs_max_output, output);
768
+ output = epilogue_fma(converted_scale_d, output, ElementCompute(0));
769
+ }
770
+
771
+ inter_accum[m_b][n_b] = ElementCompute(output);
772
+ }
773
+ } // n_b
774
+
775
+ if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n < cute::size<1>(epilogue_params.D.layout())) {
776
+ if (raw_pointer_cast(epilogue_params.Bias.data()) && IsBackpropFusion) {
777
+ ElementCompute converted_dBias = bias_converter(epilogue_params.Bias(m + m_b));
778
+ local_dBias = add(local_dBias, converted_dBias);
779
+ epilogue_params.Bias(m + m_b) = dBias_converter(local_dBias);
780
+ }
781
+ }
782
+ } // m_b
783
+
784
+ if constexpr (
785
+ SfGenStrategy == SfStrategy::SfDGen
786
+ ) {
787
+ // 1d scale factor generation
788
+ constexpr int kVectorSize = typename EpilogueParams::SFD_VectorSize{};
789
+ if (epilogue_params.SfD.data() != nullptr) {
790
+ compute_1d_scaling_factor_and_quantized_output<kVectorSize>(epilogue_params, epilogue_params.D, epilogue_params.SfD, m, n, l, inter_accum);
791
+ }
792
+ }
793
+
794
+ for (int m_b = 0; m_b < kBlockM; ++m_b) {
795
+ for (int n_b = 0; n_b < kBlockN; ++n_b) {
796
+ if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) {
797
+ epilogue_params.D(m + m_b, n + n_b, l) = destination_converter(inter_accum[m_b][n_b]);
798
+ }
799
+ }
800
+ }
801
+
802
+ #if defined(_OPENMP)
803
+ #pragma omp critical(Abs_Max_Data_Update)
804
+ #endif
805
+ {
806
+ if constexpr (IsScalingAndAmaxOutputNeeded) {
807
+ if (epilogue_params.abs_max_D) {
808
+ *epilogue_params.abs_max_D = maximum_with_nan_propogation<ElementAccumulator>{}(
809
+ *epilogue_params.abs_max_D, abs_max_output_converter(local_abs_max_output));
810
+ }
811
+ }
812
+
813
+ if constexpr (IsScalingAndAmaxAuxOutputNeeded) {
814
+ if (epilogue_params.abs_max_Aux) {
815
+ *epilogue_params.abs_max_Aux = maximum_with_nan_propogation<ElementAccumulator>{}(
816
+ *epilogue_params.abs_max_Aux, abs_max_output_converter(local_abs_max_aux_output));
817
+ }
818
+ }
819
+ }
820
+ }
821
+
822
+ /////////////////////////////////////////////////////////////////////////////////////////////////
823
+
824
+ template <class TensorType>
825
+ auto make_layout_rank3(const TensorType& tensor) {
826
+ // append a batch mode of size 1 if we do not have tensors that are rank 3
827
+ return make_layout(
828
+ make_shape(cute::get<0>(tensor.shape()), cute::get<1>(tensor.shape()), cute::Int<1>{}),
829
+ make_stride(cute::get<0>(tensor.stride()), cute::get<1>(tensor.stride()), int64_t(cosize(tensor.layout()))));
830
+ }
831
+
832
+ /// GEMM - General Matrix-Matrix contraction without conjugation options
833
+ template <
834
+ class MainloopParams,
835
+ class EpilogueParams
836
+ >
837
+ void Gemm3x(
838
+ MainloopParams const& mainloop_params,
839
+ EpilogueParams const& epilogue_params)
840
+ {
841
+ using namespace cute;
842
+
843
+ static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename MainloopParams::LayoutB{}));
844
+ static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == cute::rank(typename EpilogueParams::LayoutD{}));
845
+ static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename EpilogueParams::LayoutC{}));
846
+
847
+ if constexpr (cute::rank(typename MainloopParams::LayoutA{}) == 2) {
848
+ cute::Layout layout_A = make_layout_rank3(mainloop_params.A);
849
+ cute::Layout layout_B = make_layout_rank3(mainloop_params.B);
850
+ cute::Layout layout_C = make_layout_rank3(epilogue_params.C);
851
+ cute::Layout layout_D = make_layout_rank3(epilogue_params.D);
852
+ cute::Layout layout_Aux = make_layout_rank3(epilogue_params.Aux);
853
+ cute::Layout layout_Bias = make_layout_rank3(epilogue_params.Bias);
854
+ cute::Layout layout_Valpha = make_layout_rank3(epilogue_params.Valpha);
855
+ cute::Layout layout_Vbeta = make_layout_rank3(epilogue_params.Vbeta);
856
+
857
+ auto TensorA = make_tensor(mainloop_params.A.data(), layout_A);
858
+ auto TensorB = make_tensor(mainloop_params.B.data(), layout_B);
859
+ auto TensorC = make_tensor(epilogue_params.C.data(), layout_C);
860
+ auto TensorD = make_tensor(epilogue_params.D.data(), layout_D);
861
+ auto TensorAux = make_tensor(epilogue_params.Aux.data(), layout_Aux);
862
+ auto VectorBias = make_tensor(epilogue_params.Bias.data(), layout_Bias);
863
+ auto VectorAlpha = make_tensor(epilogue_params.Valpha.data(), layout_Valpha);
864
+ auto VectorBeta = make_tensor(epilogue_params.Vbeta.data(), layout_Vbeta);
865
+
866
+ // Reconstruct mainloop params
867
+ GettMainloopParams<typename MainloopParams::ElementAccumulator,
868
+ decltype(TensorA),
869
+ decltype(TensorB)>
870
+ mainloop_params_converted{TensorA,
871
+ TensorB,
872
+ mainloop_params.transform_A,
873
+ mainloop_params.transform_B};
874
+
875
+ // Reconstruct epilogue params
876
+ GettEpilogueParams<typename EpilogueParams::ElementScalar,
877
+ typename EpilogueParams::ElementScalingFactor,
878
+ typename EpilogueParams::ElementAccumulator,
879
+ typename EpilogueParams::ElementCompute,
880
+ decltype(TensorC),
881
+ decltype(TensorD),
882
+ decltype(VectorBias),
883
+ decltype(TensorAux),
884
+ decltype(VectorAlpha),
885
+ decltype(VectorBeta)
886
+ >
887
+ epilogue_params_converted{epilogue_params.alpha,
888
+ epilogue_params.beta,
889
+ TensorC,
890
+ TensorD,
891
+ VectorBias,
892
+ TensorAux,
893
+ VectorAlpha,
894
+ VectorBeta,
895
+ epilogue_params.abs_amax_D,
896
+ epilogue_params.abs_amax_Aux,
897
+ epilogue_params.scale_a,
898
+ epilogue_params.scale_b,
899
+ epilogue_params.scale_c,
900
+ epilogue_params.scale_d,
901
+ epilogue_params.scale_aux
902
+ };
903
+
904
+ Gett(mainloop_params_converted, epilogue_params_converted);
905
+ }
906
+ else {
907
+ // if we already have a batch mode, just pass it through
908
+ Gett(mainloop_params, epilogue_params);
909
+ }
910
+ }
911
+
912
+ /////////////////////////////////////////////////////////////////////////////////////////////////
913
+
914
+ } // cutlass::reference::host
915
+
916
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/rank_2k.h ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for Rank 2k update in host-side code.
33
+
34
+
35
+
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/blas3.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/tensor_view.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+ #include "cutlass/arch/mma.h"
45
+ #include "cutlass/util/host_tensor.h"
46
+ #include "cutlass/util/reference/host/gemm.h"
47
+
48
+ namespace cutlass {
49
+ namespace reference {
50
+ namespace host {
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
55
+ /// objects.
56
+ template <
57
+ typename ElementA,
58
+ typename LayoutA,
59
+ typename ElementB,
60
+ typename LayoutB,
61
+ typename ElementC,
62
+ typename LayoutC,
63
+ FillMode FillModeC,
64
+ typename ScalarType,
65
+ typename ComputeType,
66
+ typename InnerProductOp = multiply_add<ComputeType>,
67
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
68
+ >
69
+ void compute_rank2k(
70
+ gemm::GemmCoord problem_size,
71
+ ScalarType alpha,
72
+ TensorRef<ElementA, LayoutA> tensor_a,
73
+ TensorRef<ElementB, LayoutB> tensor_b,
74
+ ScalarType beta,
75
+ TensorRef<ElementC, LayoutC> tensor_c,
76
+ TensorRef<ElementC, LayoutC> tensor_d,
77
+ ComputeType initial_accum) {
78
+
79
+ static_assert(
80
+ LayoutA::kRank == 2 &&
81
+ LayoutB::kRank == 2 &&
82
+ LayoutC::kRank == 2,
83
+ "Tensors must be of rank 2");
84
+
85
+ static_assert(
86
+ FillModeC == FillMode::kLower ||
87
+ FillModeC == FillMode::kUpper,
88
+ "Fill Mode can either be Lower or Upper.");
89
+
90
+ using CompareOp = typename platform::conditional<(FillModeC == FillMode::kLower),
91
+ std::greater_equal<int>,
92
+ std::less_equal<int>>::type;
93
+
94
+ // Note: batch is ignored.
95
+ // Note: M is same as N for Rank 2k update
96
+ int const N = problem_size.n();
97
+ int const K = problem_size.k();
98
+
99
+ // Blocking necessary to speedup reference implementation
100
+ int const Nblock = 16;
101
+
102
+ ConvertOp convert_op;
103
+ InnerProductOp inner_product_op;
104
+ CompareOp compare_op;
105
+
106
+ for (int row_block = 0; row_block < N; row_block += Nblock) {
107
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
108
+
109
+ ComputeType accum[Nblock][Nblock];
110
+
111
+ for (int j = 0; j < Nblock; j++) {
112
+ for (int i = 0; i < Nblock; i++) {
113
+ accum[i][j] = initial_accum;
114
+ }
115
+ }
116
+
117
+ for (int k_block = 0; k_block < K; ++k_block) {
118
+ for (int j = 0; j < Nblock; j++) {
119
+ for (int i = 0; i < Nblock; i++) {
120
+ int row = row_block + i;
121
+ int col = col_block + j;
122
+
123
+ if (row < N && col < N && compare_op(row, col))
124
+ {
125
+
126
+ // A x B^T
127
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
128
+ ElementB b_t = tensor_b.at(MatrixCoord(col, k_block));
129
+
130
+ ComputeType compute_a(cast_if_scalar<ComputeType>(a));
131
+ ComputeType compute_b_t(cast_if_scalar<ComputeType>(b_t));
132
+
133
+ accum[i][j] = inner_product_op(compute_a, compute_b_t, accum[i][j]);
134
+
135
+ // B x A^T
136
+ ElementB b = tensor_b.at(MatrixCoord(row, k_block));
137
+ ElementA a_t = tensor_a.at(MatrixCoord(col, k_block));
138
+
139
+ ComputeType compute_b(cast_if_scalar<ComputeType>(b));
140
+ ComputeType compute_a_t(cast_if_scalar<ComputeType>(a_t));
141
+
142
+ accum[i][j] = inner_product_op(compute_b, compute_a_t, accum[i][j]);
143
+ }
144
+ }
145
+ }
146
+ }
147
+
148
+ for (int j = 0; j < Nblock; j++) {
149
+ for (int i = 0; i < Nblock; i++) {
150
+ int row = row_block + i;
151
+ int col = col_block + j;
152
+
153
+ MatrixCoord coord = MatrixCoord(row, col);
154
+
155
+ if (row < N && col < N &&
156
+ ( (FillModeC == FillMode::kLower && row >= col) ||
157
+ (FillModeC == FillMode::kUpper && row <= col) )
158
+ ) {
159
+ tensor_d.at(coord) = convert_op(
160
+ alpha * ScalarType(accum[i][j]) +
161
+ beta * ScalarType(tensor_c.at(coord)));
162
+ }
163
+ }
164
+ }
165
+ }
166
+ }
167
+ }
168
+
169
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
170
+
171
+ /// Computes a general Rank 2k update (tensors of rank=2) pointed to by TensorRef
172
+ /// objects.
173
+ template <
174
+ typename ElementA,
175
+ typename LayoutA,
176
+ typename ElementB,
177
+ typename LayoutB,
178
+ typename ElementC,
179
+ typename LayoutC,
180
+ FillMode FillModeC,
181
+ typename ScalarType,
182
+ typename ComputeType,
183
+ typename InnerProductOp = multiply_add<ComputeType>,
184
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
185
+ >
186
+ void compute_rank2k(
187
+ gemm::GemmCoord problem_size,
188
+ ScalarType alpha,
189
+ TensorRef<ElementA, LayoutA> tensor_a,
190
+ TensorRef<ElementB, LayoutB> tensor_b,
191
+ ScalarType beta,
192
+ TensorRef<ElementC, LayoutC> tensor_c,
193
+ ComputeType initial_accum) {
194
+ compute_rank2k<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC,
195
+ ScalarType, ComputeType, InnerProductOp, ConvertOp>(
196
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
197
+ initial_accum);
198
+ }
199
+
200
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
201
+
202
+ template <
203
+ typename ElementA,
204
+ typename LayoutA,
205
+ typename ElementB,
206
+ typename LayoutB,
207
+ typename ElementC,
208
+ typename LayoutC,
209
+ FillMode FillModeC,
210
+ typename ScalarType,
211
+ typename ComputeType,
212
+ typename InnerProductOp = cutlass::arch::OpMultiplyAdd
213
+ >
214
+ struct Rank2K;
215
+
216
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
217
+
218
+ /// Partial specialization for multiply-add
219
+ template <typename ElementA, typename LayoutA,
220
+ typename ElementB, typename LayoutB,
221
+ typename ElementC, typename LayoutC, FillMode FillModeC,
222
+ typename ScalarType, typename ComputeType>
223
+ struct Rank2K<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC, ScalarType,
224
+ ComputeType, arch::OpMultiplyAdd> {
225
+
226
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
227
+ TensorRef<ElementA, LayoutA> tensor_a,
228
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
229
+ TensorRef<ElementC, LayoutC> tensor_c,
230
+ ComputeType initial_accum = ComputeType(0)) {
231
+ static_assert(
232
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
233
+ "Tensors must be of rank 2");
234
+
235
+ compute_rank2k<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC,
236
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
237
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
238
+ }
239
+
240
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
241
+ TensorRef<ElementA, LayoutA> tensor_a,
242
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
243
+ TensorRef<ElementC, LayoutC> tensor_c,
244
+ TensorRef<ElementC, LayoutC> tensor_d,
245
+ ComputeType initial_accum = ComputeType(0)) {
246
+ static_assert(
247
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
248
+ "Tensors must be of rank 2");
249
+
250
+ compute_rank2k<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC,
251
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
252
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
253
+ }
254
+ };
255
+
256
+
257
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
258
+
259
+ } // namespace host
260
+ } // namespace reference
261
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued Rank 2K update in host-side code.
33
+
34
+
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/blas3.h"
40
+ #include "cutlass/complex.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/tensor_view.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+ #include <cassert>
45
+
46
+ namespace cutlass {
47
+ namespace reference {
48
+ namespace host {
49
+
50
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
53
+ /// objects.
54
+ ///
55
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
56
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
57
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
58
+ /// arguments explicitly.
59
+ template <
60
+ typename ElementA,
61
+ typename LayoutA,
62
+ typename ElementB,
63
+ typename LayoutB,
64
+ typename ElementC,
65
+ typename LayoutC,
66
+ typename ScalarType,
67
+ typename ComputeType,
68
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>,
69
+ typename InnerProductOp = multiply_add<ComputeType>
70
+ >
71
+ void Rank2KComplex(
72
+ gemm::GemmCoord problem_size,
73
+ ScalarType alpha,
74
+ TensorRef<ElementA, LayoutA> tensor_a,
75
+ ComplexTransform transform_a,
76
+ TensorRef<ElementB, LayoutB> tensor_b,
77
+ ComplexTransform transform_b,
78
+ ScalarType beta,
79
+ TensorRef<ElementC, LayoutC> tensor_c,
80
+ TensorRef<ElementC, LayoutC> tensor_d,
81
+ ComputeType initial_accum,
82
+ FillMode fill_mode_c,
83
+ BlasMode blas_mode,
84
+ int batch_count = 1,
85
+ int64_t batch_stride_A = 0,
86
+ int64_t batch_stride_B = 0,
87
+ int64_t batch_stride_C = 0,
88
+ int64_t batch_stride_D = 0) {
89
+
90
+ static_assert(
91
+ LayoutA::kRank == 2 &&
92
+ LayoutB::kRank == 2 &&
93
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
94
+
95
+ // Note: batch is ignored.
96
+ int const M = problem_size.m();
97
+ int const N = problem_size.n();
98
+ int const K = problem_size.k();
99
+
100
+ // Rank2K update operates on A=NxK, B=NxK, and C=NxN
101
+ assert(M==N);
102
+
103
+ // Blocking necessary to speedup reference implementation
104
+ int const Mblock = 16;
105
+ int const Nblock = 16;
106
+
107
+ ConvertOp convert_op;
108
+ InnerProductOp inner_product_op;
109
+
110
+ for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
111
+
112
+ // Compute matrix product using blocks
113
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
114
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
115
+
116
+ ComputeType accum[Mblock][Nblock];
117
+
118
+ for (int j = 0; j < Nblock; j++) {
119
+ for (int i = 0; i < Mblock; i++) {
120
+ accum[i][j] = initial_accum;
121
+ }
122
+ }
123
+
124
+ for (int k_block = 0; k_block < K; ++k_block) {
125
+ for (int j = 0; j < Nblock; j++) {
126
+ for (int i = 0; i < Mblock; i++) {
127
+ int row = row_block + i;
128
+ int col = col_block + j;
129
+
130
+ if (row < M && col < N &&
131
+ ( (fill_mode_c == FillMode::kLower && row >= col) ||
132
+ (fill_mode_c == FillMode::kUpper && row <= col) )
133
+ ) {
134
+
135
+ // A x B^T (Symmetric) or A x B^H (Hermitian)
136
+ // complex conjugation on operandB (b_t) is function of blas3 computation
137
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
138
+ ElementB b_t = (blas_mode == BlasMode::kHermitian) ?
139
+ conj(tensor_b.at(MatrixCoord(col, k_block))) :
140
+ tensor_b.at(MatrixCoord(col, k_block));
141
+
142
+ ComputeType a_ik = ComputeType(a);
143
+ ComputeType b_jk = ComputeType(b_t);
144
+
145
+ // complex conjugation is a function of operand layouts
146
+ if (transform_a == ComplexTransform::kConjugate) {
147
+ a_ik = conj(a_ik);
148
+ }
149
+ // complex conjugation is a function of operand layouts
150
+ if (transform_b == ComplexTransform::kConjugate) {
151
+ b_jk = conj(b_jk);
152
+ }
153
+
154
+ accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]);
155
+ }
156
+ }
157
+ }
158
+ }
159
+
160
+ /* HER2K need two epilogues to handle complex alpha value */
161
+ if ( blas_mode == BlasMode::kHermitian ) {
162
+ for (int j = 0; j < Nblock; j++) {
163
+ for (int i = 0; i < Mblock; i++) {
164
+ int row = row_block + i;
165
+ int col = col_block + j;
166
+
167
+ MatrixCoord coord = MatrixCoord(row, col);
168
+
169
+ if (row < M && col < N &&
170
+ ((fill_mode_c == FillMode::kLower && row >= col) ||
171
+ (fill_mode_c == FillMode::kUpper && row <= col))
172
+ ) {
173
+
174
+ ScalarType c = tensor_c.at(coord);
175
+ // The imaginary parts of the diagonal elements of
176
+ // a complex data type are assumed and set to zero
177
+ if (blas_mode == BlasMode::kHermitian) {
178
+ c = (row == col) ? real(c) : c;
179
+ }
180
+
181
+ tensor_d.at(coord) = convert_op(alpha *
182
+ ScalarType(accum[i][j]) +
183
+ beta * c);
184
+ }
185
+ }
186
+ }
187
+
188
+ /* Zeoring out accum for second HERK */
189
+ for (int j = 0; j < Nblock; j++) {
190
+ for (int i = 0; i < Mblock; i++) {
191
+ accum[i][j] = initial_accum;
192
+ }
193
+ }
194
+ }
195
+
196
+ for (int k_block = 0; k_block < K; ++k_block) {
197
+ for (int j = 0; j < Nblock; j++) {
198
+ for (int i = 0; i < Mblock; i++) {
199
+ int row = row_block + i;
200
+ int col = col_block + j;
201
+
202
+ if (row < M && col < N &&
203
+ ( (fill_mode_c == FillMode::kLower && row >= col) ||
204
+ (fill_mode_c == FillMode::kUpper && row <= col) )
205
+ ) {
206
+
207
+ // B x A^T (Symmetric) or B x A^H (Hermitian)
208
+ // complex conjugation on operandB (a_t) is function of blas3 computation
209
+ ElementB b = tensor_b.at(MatrixCoord(row, k_block));
210
+ ElementA a_t = (blas_mode == BlasMode::kHermitian) ?
211
+ conj(tensor_a.at(MatrixCoord(col, k_block))):
212
+ tensor_a.at(MatrixCoord(col, k_block));
213
+
214
+ ComputeType b_ik = ComputeType(b);
215
+ ComputeType a_jk = ComputeType(a_t);
216
+
217
+ // complex conjugation here is a function of operand layouts
218
+ if (transform_b == ComplexTransform::kConjugate) {
219
+ b_ik = conj(b_ik);
220
+ }
221
+ // complex conjugation here is a function of operand layouts
222
+ if (transform_a == ComplexTransform::kConjugate) {
223
+ a_jk = conj(a_jk);
224
+ }
225
+
226
+ accum[i][j] = inner_product_op(b_ik, a_jk, accum[i][j]);
227
+ }
228
+ }
229
+ }
230
+ }
231
+
232
+ ScalarType alpha_hermitian = (blas_mode == BlasMode::kHermitian) ?
233
+ conj(alpha) : alpha;
234
+ ScalarType beta_hermitian = (blas_mode == BlasMode::kHermitian) ?
235
+ 1 : beta;
236
+
237
+ for (int j = 0; j < Nblock; j++) {
238
+ for (int i = 0; i < Mblock; i++) {
239
+ int row = row_block + i;
240
+ int col = col_block + j;
241
+
242
+ MatrixCoord coord = MatrixCoord(row, col);
243
+
244
+ if (row < M && col < N &&
245
+ ((fill_mode_c == FillMode::kLower && row >= col) ||
246
+ (fill_mode_c == FillMode::kUpper && row <= col))
247
+ ) {
248
+
249
+ ScalarType d = (blas_mode == BlasMode::kHermitian) ?
250
+ tensor_d.at(coord) : tensor_c.at(coord);
251
+
252
+ ScalarType tmp_d = convert_op(
253
+ alpha_hermitian * ScalarType(accum[i][j]) +
254
+ beta_hermitian * d);
255
+
256
+ if (blas_mode == BlasMode::kHermitian && row == col ) {
257
+ tensor_d.at(coord) = real(tmp_d);
258
+ } else {
259
+ tensor_d.at(coord) = tmp_d;
260
+ }
261
+ }
262
+ }
263
+ }
264
+
265
+ } // for (col_block)
266
+ } // for (row_block)
267
+
268
+ tensor_a.add_pointer_offset(batch_stride_A);
269
+ tensor_b.add_pointer_offset(batch_stride_B);
270
+ tensor_c.add_pointer_offset(batch_stride_C);
271
+ tensor_d.add_pointer_offset(batch_stride_D);
272
+
273
+ } // for (batch_idx)
274
+ }
275
+
276
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
277
+
278
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
279
+ /// objects.
280
+ ///
281
+ /// This assumes the accumulator type is the same type as the scalars.
282
+ template <
283
+ typename ElementA,
284
+ typename LayoutA,
285
+ typename ElementB,
286
+ typename LayoutB,
287
+ typename ElementC,
288
+ typename LayoutC,
289
+ typename ScalarType
290
+ >
291
+ void Rank2KComplex(
292
+ gemm::GemmCoord problem_size,
293
+ ScalarType alpha,
294
+ TensorRef<ElementA, LayoutA> tensor_a,
295
+ ComplexTransform transform_a,
296
+ TensorRef<ElementB, LayoutB> tensor_b,
297
+ ComplexTransform transform_b,
298
+ ScalarType beta,
299
+ TensorRef<ElementC, LayoutC> tensor_c,
300
+ TensorRef<ElementC, LayoutC> tensor_d,
301
+ FillMode fill_mode_c,
302
+ BlasMode blas_mode) {
303
+
304
+ Rank2KComplex(
305
+ problem_size, alpha,
306
+ tensor_a, transform_a,
307
+ tensor_b, transform_b,
308
+ beta, tensor_c, tensor_d,
309
+ ScalarType(0),
310
+ fill_mode_c,
311
+ blas_mode);
312
+ }
313
+
314
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
315
+
316
+ } // namespace host
317
+ } // namespace reference
318
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/rank_k_complex.h ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued Rank 2K update in host-side code.
33
+
34
+
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/blas3.h"
40
+ #include "cutlass/complex.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/tensor_view.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+ #include <cassert>
45
+
46
+ namespace cutlass {
47
+ namespace reference {
48
+ namespace host {
49
+
50
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
53
+ /// objects.
54
+ ///
55
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
56
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
57
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
58
+ /// arguments explicitly.
59
+ template <
60
+ typename ElementA,
61
+ typename LayoutA,
62
+ typename ElementC,
63
+ typename LayoutC,
64
+ typename ScalarType,
65
+ typename ComputeType,
66
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>,
67
+ typename InnerProductOp = multiply_add<ComputeType>
68
+ >
69
+ void Rank2KComplex(
70
+ gemm::GemmCoord problem_size,
71
+ ScalarType alpha,
72
+ TensorRef<ElementA, LayoutA> tensor_a,
73
+ ComplexTransform transform_a,
74
+ ScalarType beta,
75
+ TensorRef<ElementC, LayoutC> tensor_c,
76
+ TensorRef<ElementC, LayoutC> tensor_d,
77
+ ComputeType initial_accum,
78
+ FillMode fill_mode_c,
79
+ BlasMode blas_mode,
80
+ int batch_count = 1,
81
+ int64_t batch_stride_A = 0,
82
+ int64_t batch_stride_C = 0,
83
+ int64_t batch_stride_D = 0) {
84
+
85
+ static_assert(
86
+ LayoutA::kRank == 2 &&
87
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
88
+
89
+ // Note: batch is ignored.
90
+ int const M = problem_size.m();
91
+ int const N = problem_size.n();
92
+ int const K = problem_size.k();
93
+
94
+ // Rank2K update operates on A=NxK, B=NxK, and C=NxN
95
+ assert(M==N);
96
+
97
+ // Blocking necessary to speedup reference implementation
98
+ int const Mblock = 16;
99
+ int const Nblock = 16;
100
+
101
+ ConvertOp convert_op;
102
+ InnerProductOp inner_product_op;
103
+
104
+ for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
105
+
106
+ // Compute matrix product using blocks
107
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
108
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
109
+
110
+ ComputeType accum[Mblock][Nblock];
111
+
112
+ for (int j = 0; j < Nblock; j++) {
113
+ for (int i = 0; i < Mblock; i++) {
114
+ accum[i][j] = initial_accum;
115
+ }
116
+ }
117
+
118
+ for (int k_block = 0; k_block < K; ++k_block) {
119
+ for (int j = 0; j < Nblock; j++) {
120
+ for (int i = 0; i < Mblock; i++) {
121
+ int row = row_block + i;
122
+ int col = col_block + j;
123
+
124
+ if (row < M && col < N &&
125
+ ( (fill_mode_c == FillMode::kLower && row >= col) ||
126
+ (fill_mode_c == FillMode::kUpper && row <= col) )
127
+ ) {
128
+
129
+ // A x A^T (Symmetric) or A x A^H (Hermitian)
130
+ // complex conjugation on operandB (a_t) (function of blas3 computation)
131
+ ElementA a = tensor_a.at(MatrixCoord(row, k_block));
132
+ ElementA a_t = (blas_mode == BlasMode::kHermitian) ?
133
+ conj(tensor_a.at(MatrixCoord(col, k_block))) :
134
+ tensor_a.at(MatrixCoord(col, k_block));
135
+
136
+ ComputeType a_ik = ComputeType(a);
137
+ ComputeType b_jk = ComputeType(a_t);
138
+
139
+ // complex conjugation (function of input layouts)
140
+ if (transform_a == ComplexTransform::kConjugate) {
141
+ a_ik = conj(a_ik);
142
+ }
143
+ // complex conjugation (function of input layouts)
144
+ if (transform_a == ComplexTransform::kConjugate) {
145
+ b_jk = conj(b_jk);
146
+ }
147
+
148
+ accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]);
149
+
150
+ }
151
+ }
152
+ }
153
+ }
154
+
155
+ for (int j = 0; j < Nblock; j++) {
156
+ for (int i = 0; i < Mblock; i++) {
157
+ int row = row_block + i;
158
+ int col = col_block + j;
159
+
160
+ MatrixCoord coord = MatrixCoord(row, col);
161
+
162
+ if (row < M && col < N &&
163
+ ((fill_mode_c == FillMode::kLower && row >= col) ||
164
+ (fill_mode_c == FillMode::kUpper && row <= col))
165
+ ) {
166
+
167
+ ScalarType c = tensor_c.at(coord);
168
+ // The imaginary parts of the diagonal elements of
169
+ // a complex data type are assumed and set to zero
170
+ if (blas_mode == BlasMode::kHermitian) {
171
+ c = (row == col) ? real(c) : c;
172
+ }
173
+
174
+ ScalarType tmp_d = convert_op(
175
+ alpha * ScalarType(accum[i][j]) +
176
+ beta * c);
177
+
178
+ if (blas_mode == BlasMode::kHermitian && row == col ) {
179
+ tensor_d.at(coord) = real(tmp_d);
180
+ } else {
181
+ tensor_d.at(coord) = tmp_d;
182
+ }
183
+ }
184
+ }
185
+ }
186
+
187
+ } // for (col_block)
188
+ } // for (row_block)
189
+
190
+ tensor_a.add_pointer_offset(batch_stride_A);
191
+ tensor_c.add_pointer_offset(batch_stride_C);
192
+ tensor_d.add_pointer_offset(batch_stride_D);
193
+
194
+ } // for (batch_idx)
195
+ }
196
+
197
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
198
+
199
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
200
+ /// objects.
201
+ ///
202
+ /// This assumes the accumulator type is the same type as the scalars.
203
+ template <
204
+ typename ElementA,
205
+ typename LayoutA,
206
+ typename ElementC,
207
+ typename LayoutC,
208
+ typename ScalarType
209
+ >
210
+ void RankKComplex(
211
+ gemm::GemmCoord problem_size,
212
+ ScalarType alpha,
213
+ TensorRef<ElementA, LayoutA> tensor_a,
214
+ ComplexTransform transform_a,
215
+ ScalarType beta,
216
+ TensorRef<ElementC, LayoutC> tensor_c,
217
+ TensorRef<ElementC, LayoutC> tensor_d,
218
+ FillMode fill_mode_c,
219
+ BlasMode blas_mode) {
220
+
221
+ Rank2KComplex(
222
+ problem_size, alpha,
223
+ tensor_a, transform_a,
224
+ beta, tensor_c, tensor_d,
225
+ ScalarType(0),
226
+ fill_mode_c,
227
+ blas_mode);
228
+ }
229
+
230
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
231
+
232
+ } // namespace host
233
+ } // namespace reference
234
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/symm.h ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for SYMM update in host-side code.
33
+
34
+
35
+
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/blas3.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+
43
+ #include "cutlass/tensor_view.h"
44
+ #include "cutlass/gemm/gemm.h"
45
+ #include "cutlass/arch/mma.h"
46
+ #include "cutlass/util/host_tensor.h"
47
+ #include "cutlass/util/reference/host/gemm.h"
48
+
49
+ namespace cutlass {
50
+ namespace reference {
51
+ namespace host {
52
+
53
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
56
+ /// objects.
57
+ template <
58
+ typename ElementA,
59
+ typename LayoutA,
60
+ SideMode SideModeA,
61
+ FillMode FillModeA,
62
+ typename ElementB,
63
+ typename LayoutB,
64
+ typename ElementC,
65
+ typename LayoutC,
66
+ typename ScalarType,
67
+ typename ComputeType,
68
+ typename InnerProductOp = multiply_add<ComputeType>,
69
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
70
+ >
71
+ void compute_symm(
72
+ gemm::GemmCoord problem_size,
73
+ ScalarType alpha,
74
+ TensorRef<ElementA, LayoutA> tensor_a,
75
+ TensorRef<ElementB, LayoutB> tensor_b,
76
+ ScalarType beta,
77
+ TensorRef<ElementC, LayoutC> tensor_c,
78
+ TensorRef<ElementC, LayoutC> tensor_d,
79
+ ComputeType initial_accum) {
80
+
81
+ static_assert(
82
+ LayoutA::kRank == 2 &&
83
+ LayoutB::kRank == 2 &&
84
+ LayoutC::kRank == 2,
85
+ "Tensors must be of rank 2");
86
+
87
+ static_assert(SideModeA != SideMode::kInvalid
88
+ , "Side Mode can either be Left or Right.");
89
+
90
+ static_assert(
91
+ FillModeA == FillMode::kLower ||
92
+ FillModeA == FillMode::kUpper,
93
+ "Fill Mode can either be Lower or Upper.");
94
+
95
+ using CompareOp_w_diag = typename TrMatrixCompareOp<FillModeA, DiagType::kNonUnit>::Type;
96
+ using CompareOp_wo_diag = typename TrMatrixCompareOp<FillModeA, DiagType::kZero>::Type;
97
+
98
+ // Note: batch is ignored.
99
+ int const M = problem_size.m();
100
+ int const N = problem_size.n();
101
+ // Assuming correct k-dimension value is passed
102
+ int const K = problem_size.k();
103
+
104
+ // Blocking necessary to speedup reference implementation
105
+ int const Mblock = 16;
106
+ int const Nblock = 16;
107
+
108
+ ConvertOp convert_op;
109
+ InnerProductOp inner_product_op;
110
+ CompareOp_w_diag compare_op_1;
111
+ CompareOp_wo_diag compare_op_2;
112
+
113
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
114
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
115
+
116
+ ComputeType accum[Mblock][Nblock];
117
+
118
+ for (int j = 0; j < Nblock; j++) {
119
+ for (int i = 0; i < Mblock; i++) {
120
+ accum[i][j] = initial_accum;
121
+ }
122
+ }
123
+
124
+ for (int k_block = 0; k_block < K; ++k_block) {
125
+ for (int j = 0; j < Nblock; j++) {
126
+ for (int i = 0; i < Mblock; i++) {
127
+ int row = row_block + i;
128
+ int col = col_block + j;
129
+
130
+ if (row < M && col < N) {
131
+ ElementA a_1 = ElementA();
132
+ ElementB b_1 = ElementB();
133
+ ElementA a_2 = ElementA();
134
+ ElementB b_2 = ElementB();
135
+
136
+ // A x B or B x A (with diagonal)
137
+ if (SideModeA == SideMode::kLeft) {
138
+ a_1 = (compare_op_1(row, k_block)) ?
139
+ (tensor_a.at(MatrixCoord(row, k_block))) : ElementA();
140
+ b_1 = tensor_b.at(MatrixCoord(k_block, col));
141
+ } else if (SideModeA == SideMode::kRight) {
142
+ a_1 = tensor_b.at(MatrixCoord(row, k_block));
143
+ b_1 = (compare_op_1(k_block, col)) ?
144
+ tensor_a.at(MatrixCoord(k_block, col)) : ElementA();
145
+ }
146
+
147
+ ComputeType compute_a_1(cast_if_scalar<ComputeType>(a_1));
148
+ ComputeType compute_b_1(cast_if_scalar<ComputeType>(b_1));
149
+
150
+ accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]);
151
+
152
+ // A^T x B or B x A^T (without diagonal)
153
+ if (SideModeA == SideMode::kLeft) {
154
+ a_2 = (compare_op_2(k_block, row)) ?
155
+ (tensor_a.at(MatrixCoord(k_block, row))) : ElementA();
156
+ b_2 = tensor_b.at(MatrixCoord(k_block, col));
157
+ } else if (SideModeA == SideMode::kRight) {
158
+ a_2 = tensor_b.at(MatrixCoord(row, k_block));
159
+ b_2 = (compare_op_2(col, k_block)) ?
160
+ tensor_a.at(MatrixCoord(col, k_block)) : ElementA();
161
+ }
162
+
163
+ ComputeType compute_a_2(cast_if_scalar<ComputeType>(a_2));
164
+ ComputeType compute_b_2(cast_if_scalar<ComputeType>(b_2));
165
+
166
+ accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]);
167
+ }
168
+ }
169
+ }
170
+ }
171
+
172
+ for (int j = 0; j < Nblock; j++) {
173
+ for (int i = 0; i < Mblock; i++) {
174
+ int row = row_block + i;
175
+ int col = col_block + j;
176
+
177
+ MatrixCoord coord = MatrixCoord(row, col);
178
+
179
+ if (row < M && col < N) {
180
+ tensor_d.at(coord) = convert_op(
181
+ alpha * ScalarType(accum[i][j]) +
182
+ beta * ScalarType(tensor_c.at(coord)));
183
+ }
184
+ }
185
+ }
186
+ }
187
+ }
188
+ }
189
+
190
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
191
+
192
+ /// Computes a general Symm update (tensors of rank=2) pointed to by TensorRef
193
+ /// objects.
194
+ template <
195
+ typename ElementA,
196
+ typename LayoutA,
197
+ SideMode SideModeA,
198
+ FillMode FillModeA,
199
+ typename ElementB,
200
+ typename LayoutB,
201
+ typename ElementC,
202
+ typename LayoutC,
203
+ typename ScalarType,
204
+ typename ComputeType,
205
+ typename InnerProductOp = multiply_add<ComputeType>,
206
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
207
+ >
208
+ void compute_symm(
209
+ gemm::GemmCoord problem_size,
210
+ ScalarType alpha,
211
+ TensorRef<ElementA, LayoutA> tensor_a,
212
+ TensorRef<ElementB, LayoutB> tensor_b,
213
+ ScalarType beta,
214
+ TensorRef<ElementC, LayoutC> tensor_c,
215
+ ComputeType initial_accum) {
216
+ compute_symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC,
217
+ ScalarType, ComputeType, InnerProductOp, ConvertOp>(
218
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
219
+ initial_accum);
220
+ }
221
+
222
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
223
+
224
+ template <
225
+ typename ElementA,
226
+ typename LayoutA,
227
+ SideMode SideModeA,
228
+ FillMode FillModeA,
229
+ typename ElementB,
230
+ typename LayoutB,
231
+ typename ElementC,
232
+ typename LayoutC,
233
+ typename ScalarType,
234
+ typename ComputeType,
235
+ typename InnerProductOp = cutlass::arch::OpMultiplyAdd
236
+ >
237
+ struct Symm;
238
+
239
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
240
+
241
+ /// Partial specialization for multiply-add
242
+ template <typename ElementA, typename LayoutA,
243
+ SideMode SideModeA, FillMode FillModeA,
244
+ typename ElementB, typename LayoutB,
245
+ typename ElementC, typename LayoutC,
246
+ typename ScalarType, typename ComputeType>
247
+ struct Symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
248
+ ComputeType, arch::OpMultiplyAdd> {
249
+
250
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
251
+ TensorRef<ElementA, LayoutA> tensor_a,
252
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
253
+ TensorRef<ElementC, LayoutC> tensor_c,
254
+ ComputeType initial_accum = ComputeType(0)) {
255
+ static_assert(
256
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
257
+ "Tensors must be of rank 2");
258
+
259
+ compute_symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC,
260
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
261
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
262
+ }
263
+
264
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
265
+ TensorRef<ElementA, LayoutA> tensor_a,
266
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
267
+ TensorRef<ElementC, LayoutC> tensor_c,
268
+ TensorRef<ElementC, LayoutC> tensor_d,
269
+ ComputeType initial_accum = ComputeType(0)) {
270
+ static_assert(
271
+ LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
272
+ "Tensors must be of rank 2");
273
+
274
+ compute_symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC,
275
+ ScalarType, ComputeType, multiply_add<ComputeType>>(
276
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
277
+ }
278
+ };
279
+
280
+
281
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
282
+
283
+ } // namespace host
284
+ } // namespace reference
285
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/symm_complex.h ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued SYMM update in host-side code.
33
+
34
+
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/blas3.h"
40
+ #include "cutlass/complex.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/tensor_view.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+ #include <cassert>
45
+
46
+ namespace cutlass {
47
+ namespace reference {
48
+ namespace host {
49
+
50
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
53
+ /// objects.
54
+ ///
55
+ /// Explicitly naming types needed by this template can be cumbersome, particularly for the
56
+ /// accumulator type, so a function argument 'initial_accum' is exposed. Passing
57
+ /// AccumulatorType(0) as the last function argument can be easier than naming all template
58
+ /// arguments explicitly.
59
+ template <
60
+ typename ElementA,
61
+ typename LayoutA,
62
+ SideMode SideModeA,
63
+ FillMode FillModeA,
64
+ typename ElementB,
65
+ typename LayoutB,
66
+ typename ElementC,
67
+ typename LayoutC,
68
+ typename ScalarType,
69
+ typename ComputeType,
70
+ BlasMode BlasMode_ = BlasMode::kSymmetric,
71
+ typename InnerProductOp = multiply_add<ComputeType>,
72
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
73
+ >
74
+ void compute_symm_complex(
75
+ gemm::GemmCoord problem_size,
76
+ ScalarType alpha,
77
+ TensorRef<ElementA, LayoutA> tensor_a,
78
+ TensorRef<ElementB, LayoutB> tensor_b,
79
+ ScalarType beta,
80
+ TensorRef<ElementC, LayoutC> tensor_c,
81
+ TensorRef<ElementC, LayoutC> tensor_d,
82
+ ComputeType initial_accum,
83
+ int batch_count = 1,
84
+ int64_t batch_stride_A = 0,
85
+ int64_t batch_stride_B = 0,
86
+ int64_t batch_stride_C = 0,
87
+ int64_t batch_stride_D = 0) {
88
+
89
+ static SideMode const kSideModeA = SideModeA;
90
+ static FillMode const kFillModeA = FillModeA;
91
+ static BlasMode const kBlasMode = BlasMode_;
92
+
93
+ static_assert(
94
+ LayoutA::kRank == 2 &&
95
+ LayoutB::kRank == 2 &&
96
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
97
+
98
+ static_assert(kSideModeA != SideMode::kInvalid
99
+ , "Side Mode can either be Left or Right.");
100
+
101
+ static_assert(
102
+ kFillModeA == FillMode::kLower ||
103
+ kFillModeA == FillMode::kUpper,
104
+ "Fill Mode can either be Lower or Upper.");
105
+
106
+ using CompareOp_w_diag = typename TrMatrixCompareOp<kFillModeA, DiagType::kNonUnit>::Type;
107
+ using CompareOp_wo_diag = typename TrMatrixCompareOp<kFillModeA, DiagType::kZero>::Type;
108
+
109
+ // Note: batch is ignored.
110
+ int const M = problem_size.m();
111
+ int const N = problem_size.n();
112
+ // Assuming correct k-dimension value is passed
113
+ int const K = problem_size.k();
114
+
115
+ // Blocking necessary to speedup reference implementation
116
+ int const Mblock = 16;
117
+ int const Nblock = 16;
118
+
119
+ ConvertOp convert_op;
120
+ InnerProductOp inner_product_op;
121
+ CompareOp_w_diag compare_op_1;
122
+ CompareOp_wo_diag compare_op_2;
123
+
124
+ for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
125
+
126
+ // Compute matrix product using blocks
127
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
128
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
129
+
130
+ ComputeType accum[Mblock][Nblock];
131
+
132
+ for (int j = 0; j < Nblock; j++) {
133
+ for (int i = 0; i < Mblock; i++) {
134
+ accum[i][j] = initial_accum;
135
+ }
136
+ }
137
+
138
+ for (int k_block = 0; k_block < K; ++k_block) {
139
+ for (int j = 0; j < Nblock; j++) {
140
+ for (int i = 0; i < Mblock; i++) {
141
+ int row = row_block + i;
142
+ int col = col_block + j;
143
+
144
+ if (row < M && col < N)
145
+ {
146
+ ElementA a_1 = ElementA();
147
+ ElementB b_1 = ElementB();
148
+ ElementA a_2 = ElementA();
149
+ ElementB b_2 = ElementB();
150
+
151
+ // A x B or B x A (with diagonal)
152
+ if (kSideModeA == SideMode::kLeft) {
153
+ a_1 = (compare_op_1(row, k_block)) ?
154
+ (tensor_a.at(MatrixCoord(row, k_block))) : ElementA();
155
+ b_1 = tensor_b.at(MatrixCoord(k_block, col));
156
+ } else if (kSideModeA == SideMode::kRight) {
157
+ a_1 = tensor_b.at(MatrixCoord(row, k_block));
158
+ b_1 = (compare_op_1(k_block, col)) ?
159
+ tensor_a.at(MatrixCoord(k_block, col)) : ElementA();
160
+ }
161
+ ComputeType compute_a_1 = ComputeType(a_1);
162
+ ComputeType compute_b_1 = ComputeType(b_1);
163
+
164
+ // The imaginary parts of the diagonal elements of
165
+ // a complex data type are assumed and set to zero
166
+ if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kLeft && row == k_block) {
167
+ compute_a_1 = real(compute_a_1);
168
+ } else if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kRight && k_block == col) {
169
+ compute_b_1 = real(compute_b_1);
170
+ }
171
+
172
+ accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]);
173
+
174
+ // A^T x B or B x A^T (without diagonal)
175
+ if (kSideModeA == SideMode::kLeft) {
176
+ a_2 = (compare_op_2(k_block, row)) ?
177
+ (tensor_a.at(MatrixCoord(k_block, row))) : ElementA();
178
+ b_2 = tensor_b.at(MatrixCoord(k_block, col));
179
+ if (kBlasMode == BlasMode::kHermitian)
180
+ a_2 = conj(a_2);
181
+ } else if (kSideModeA == SideMode::kRight) {
182
+ a_2 = tensor_b.at(MatrixCoord(row, k_block));
183
+ b_2 = (compare_op_2(col, k_block)) ?
184
+ tensor_a.at(MatrixCoord(col, k_block)) : ElementA();
185
+ if (kBlasMode == BlasMode::kHermitian)
186
+ b_2 = conj(b_2);
187
+ }
188
+
189
+ ComputeType compute_a_2 = ComputeType(a_2);
190
+ ComputeType compute_b_2 = ComputeType(b_2);
191
+
192
+ accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]);
193
+ }
194
+ }
195
+ }
196
+ }
197
+
198
+ for (int j = 0; j < Nblock; j++) {
199
+ for (int i = 0; i < Mblock; i++) {
200
+ int row = row_block + i;
201
+ int col = col_block + j;
202
+
203
+ MatrixCoord coord = MatrixCoord(row, col);
204
+
205
+ if (row < M && col < N) {
206
+
207
+ ScalarType c = tensor_c.at(coord);
208
+
209
+ tensor_d.at(coord) = convert_op(
210
+ alpha * ScalarType(accum[i][j]) +
211
+ beta * c);
212
+ }
213
+ }
214
+ }
215
+
216
+ } // for (col_block)
217
+ } // for (row_block)
218
+
219
+ tensor_a.add_pointer_offset(batch_stride_A);
220
+ tensor_b.add_pointer_offset(batch_stride_B);
221
+ tensor_c.add_pointer_offset(batch_stride_C);
222
+ tensor_d.add_pointer_offset(batch_stride_D);
223
+
224
+ } // for (batch_idx)
225
+ }
226
+
227
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
228
+
229
+ template <
230
+ typename ElementA,
231
+ typename LayoutA,
232
+ SideMode SideModeA,
233
+ FillMode FillModeA,
234
+ typename ElementB,
235
+ typename LayoutB,
236
+ typename ElementC,
237
+ typename LayoutC,
238
+ typename ScalarType,
239
+ typename ComputeType,
240
+ BlasMode BlasMode_ = cutlass::BlasMode::kSymmetric,
241
+ typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex
242
+ >
243
+ struct SymmComplex;
244
+
245
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
246
+
247
+ /// Partial specialization for multiply-add
248
+ template <typename ElementA, typename LayoutA,
249
+ SideMode SideModeA, FillMode FillModeA,
250
+ typename ElementB, typename LayoutB,
251
+ typename ElementC, typename LayoutC,
252
+ typename ScalarType, typename ComputeType,
253
+ BlasMode BlasMode_>
254
+ struct SymmComplex<ElementA, LayoutA,
255
+ SideModeA, FillModeA,
256
+ ElementB, LayoutB,
257
+ ElementC, LayoutC, ScalarType,
258
+ ComputeType, BlasMode_,
259
+ arch::OpMultiplyAddComplex> {
260
+
261
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
262
+ TensorRef<ElementA, LayoutA> tensor_a,
263
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
264
+ TensorRef<ElementC, LayoutC> tensor_c,
265
+ TensorRef<ElementC, LayoutC> tensor_d,
266
+ ComputeType initial_accum = ComputeType(0)) {
267
+ static_assert(
268
+ LayoutA::kRank == 2 && LayoutC::kRank == 2,
269
+ "Tensors must be of rank 2");
270
+
271
+ compute_symm_complex<ElementA, LayoutA,
272
+ SideModeA, FillModeA,
273
+ ElementB, LayoutB,
274
+ ElementC, LayoutC,
275
+ ScalarType, ComputeType, BlasMode_, multiply_add<ComputeType>>(
276
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
277
+ }
278
+ };
279
+
280
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
281
+
282
+ /// Partial specialization for gaussian multiply-add
283
+ template <typename ElementA, typename LayoutA,
284
+ SideMode SideModeA, FillMode FillModeA,
285
+ typename ElementB, typename LayoutB,
286
+ typename ElementC, typename LayoutC,
287
+ typename ScalarType, typename ComputeType,
288
+ BlasMode BlasMode_>
289
+ struct SymmComplex<ElementA, LayoutA,
290
+ SideModeA, FillModeA,
291
+ ElementB, LayoutB,
292
+ ElementC, LayoutC, ScalarType,
293
+ ComputeType, BlasMode_,
294
+ arch::OpMultiplyAddGaussianComplex> {
295
+
296
+ void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
297
+ TensorRef<ElementA, LayoutA> tensor_a,
298
+ TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
299
+ TensorRef<ElementC, LayoutC> tensor_c,
300
+ TensorRef<ElementC, LayoutC> tensor_d,
301
+ ComputeType initial_accum = ComputeType(0)) {
302
+ static_assert(
303
+ LayoutA::kRank == 2 && LayoutC::kRank == 2,
304
+ "Tensors must be of rank 2");
305
+
306
+ compute_symm_complex<ElementA, LayoutA,
307
+ SideModeA, FillModeA,
308
+ ElementB, LayoutB,
309
+ ElementC, LayoutC,
310
+ ScalarType, ComputeType, BlasMode_, multiply_add<ComputeType>>(
311
+ problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
312
+ }
313
+ };
314
+
315
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
316
+
317
+ } // namespace host
318
+ } // namespace reference
319
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_compare.h ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Defines host-side elementwise operations on TensorView.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ // Standard Library includes
38
+ #include <utility>
39
+
40
+ // Cutlass includes
41
+ #include "cutlass/cutlass.h"
42
+ #include "cutlass/relatively_equal.h"
43
+ #include "cutlass/tensor_view.h"
44
+ #include "cutlass/tensor_view_planar_complex.h"
45
+
46
+ #include "cutlass/util/distribution.h"
47
+ #include "tensor_foreach.h"
48
+
49
+ namespace cutlass {
50
+ namespace reference {
51
+ namespace host {
52
+
53
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
54
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ namespace detail {
57
+
58
+ template <
59
+ typename Element, ///< Element type
60
+ typename Layout> ///< Layout function
61
+ struct TensorGreatestErrorFunc {
62
+
63
+ //
64
+ // Data members
65
+ //
66
+
67
+ TensorView<Element, Layout> lhs;
68
+ TensorView<Element, Layout> rhs;
69
+ double result;
70
+
71
+ /// Ctor
72
+ TensorGreatestErrorFunc(
73
+ TensorView<Element, Layout> const &lhs_,
74
+ TensorView<Element, Layout> const &rhs_
75
+ ) :
76
+ lhs(lhs_),
77
+ rhs(rhs_),
78
+ result(0.0) { }
79
+
80
+ /// Visits a coordinate
81
+ void operator()(Coord<Layout::kRank> const &coord) {
82
+
83
+ Element lhs_ = lhs.at(coord);
84
+ Element rhs_ = rhs.at(coord);
85
+
86
+ result = std::max(result, std::abs(double(lhs_) - double(rhs_)));
87
+ }
88
+
89
+ /// Returns true if equal
90
+ operator double() const {
91
+ return result;
92
+ }
93
+ };
94
+
95
+ template <
96
+ typename Element, ///< Element type
97
+ typename Layout> ///< Layout function
98
+ struct TensorMREFunc {
99
+
100
+ //
101
+ // Data members
102
+ //
103
+
104
+ TensorView<Element, Layout> lhs;
105
+ TensorView<Element, Layout> rhs;
106
+ double sum;
107
+ uint64_t count;
108
+ static constexpr double epsilon = 1e-6;
109
+
110
+ /// Ctor
111
+ TensorMREFunc(
112
+ TensorView<Element, Layout> const &lhs_,
113
+ TensorView<Element, Layout> const &rhs_
114
+ ) :
115
+ lhs(lhs_),
116
+ rhs(rhs_),
117
+ sum(0.0),
118
+ count(0) { }
119
+
120
+ /// Visits a coordinate
121
+ void operator()(Coord<Layout::kRank> const &coord) {
122
+
123
+ Element lhs_ = lhs.at(coord);
124
+ Element rhs_ = rhs.at(coord);
125
+
126
+ sum += std::abs(double(lhs_) - double(rhs_) / (double(rhs_) + epsilon));
127
+ ++count;
128
+ }
129
+
130
+ /// Returns true if equal
131
+ operator double() const {
132
+ return sum / double(count);
133
+ }
134
+ };
135
+
136
+ template <
137
+ typename Element, ///< Element type
138
+ typename Layout> ///< Layout function
139
+ struct TensorMSEFunc {
140
+
141
+ //
142
+ // Data members
143
+ //
144
+
145
+ TensorView<Element, Layout> lhs;
146
+ TensorView<Element, Layout> rhs;
147
+ double sum;
148
+ uint64_t count;
149
+
150
+ /// Ctor
151
+ TensorMSEFunc(
152
+ TensorView<Element, Layout> const &lhs_,
153
+ TensorView<Element, Layout> const &rhs_
154
+ ) :
155
+ lhs(lhs_),
156
+ rhs(rhs_),
157
+ sum(0.0),
158
+ count(0) { }
159
+
160
+ /// Visits a coordinate
161
+ void operator()(Coord<Layout::kRank> const &coord) {
162
+
163
+ Element lhs_ = lhs.at(coord);
164
+ Element rhs_ = rhs.at(coord);
165
+
166
+ sum += std::pow((double(lhs_) - double(rhs_)), 2);
167
+ ++count;
168
+ }
169
+
170
+ /// Returns true if equal
171
+ operator double() const {
172
+ return sum / double(count);
173
+ }
174
+ };
175
+
176
+ template <
177
+ typename Element, ///< Element type
178
+ typename Layout> ///< Layout function
179
+ struct TensorEqualsFunc {
180
+
181
+ //
182
+ // Data members
183
+ //
184
+
185
+ TensorView<Element, Layout> lhs;
186
+ TensorView<Element, Layout> rhs;
187
+ bool result;
188
+
189
+ /// Ctor
190
+ TensorEqualsFunc(): result(true) { }
191
+
192
+ /// Ctor
193
+ TensorEqualsFunc(
194
+ TensorView<Element, Layout> const &lhs_,
195
+ TensorView<Element, Layout> const &rhs_
196
+ ) :
197
+ lhs(lhs_), rhs(rhs_), result(true) { }
198
+
199
+ /// Visits a coordinate
200
+ void operator()(Coord<Layout::kRank> const &coord) {
201
+
202
+ Element lhs_ = lhs.at(coord);
203
+ Element rhs_ = rhs.at(coord);
204
+
205
+ if (lhs_ != rhs_) {
206
+ result = false;
207
+ }
208
+ }
209
+
210
+ /// Returns true if equal
211
+ operator bool() const {
212
+ return result;
213
+ }
214
+ };
215
+
216
+ template <
217
+ typename Element, ///< Element type
218
+ typename Layout> ///< Layout function
219
+ struct TensorRelativelyEqualsFunc {
220
+
221
+ //
222
+ // Data members
223
+ //
224
+
225
+ TensorView<Element, Layout> lhs;
226
+ TensorView<Element, Layout> rhs;
227
+ Element epsilon;
228
+ Element nonzero_floor;
229
+ bool result;
230
+
231
+ /// Ctor
232
+ TensorRelativelyEqualsFunc(
233
+ TensorView<Element, Layout> const &lhs_,
234
+ TensorView<Element, Layout> const &rhs_,
235
+ Element epsilon_,
236
+ Element nonzero_floor_
237
+ ) :
238
+ lhs(lhs_),
239
+ rhs(rhs_),
240
+ epsilon(epsilon_),
241
+ nonzero_floor(nonzero_floor_),
242
+ result(true) { }
243
+
244
+ /// Visits a coordinate
245
+ void operator()(Coord<Layout::kRank> const &coord) {
246
+
247
+ Element lhs_ = lhs.at(coord);
248
+ Element rhs_ = rhs.at(coord);
249
+
250
+ if (!relatively_equal(lhs_, rhs_, epsilon, nonzero_floor)) {
251
+ result = false;
252
+ }
253
+ }
254
+
255
+ /// Returns true if equal
256
+ operator bool() const {
257
+ return result;
258
+ }
259
+ };
260
+
261
+ } // namespace detail
262
+
263
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
264
+
265
+ /// Returns the Mean Squared Error between two tensors.
266
+ template <
267
+ typename Element, ///< Element type
268
+ typename Layout> ///< Layout function
269
+ double TensorMSE(
270
+ TensorView<Element, Layout> const &lhs,
271
+ TensorView<Element, Layout> const &rhs) {
272
+
273
+ // Extents must be identical
274
+ if (lhs.extent() != rhs.extent()) {
275
+ return -1;
276
+ }
277
+
278
+ detail::TensorMSEFunc<Element, Layout> func(lhs, rhs);
279
+ TensorForEach(
280
+ lhs.extent(),
281
+ func
282
+ );
283
+
284
+ return double(func);
285
+ }
286
+
287
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
288
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
289
+
290
+ /// Returns the Mean Relative Error between two tensors.
291
+ template <
292
+ typename Element, ///< Element type
293
+ typename Layout> ///< Layout function
294
+ double TensorMRE(
295
+ TensorView<Element, Layout> const &lhs,
296
+ TensorView<Element, Layout> const &rhs) {
297
+
298
+ // Extents must be identical
299
+ if (lhs.extent() != rhs.extent()) {
300
+ return -1;
301
+ }
302
+
303
+ detail::TensorMREFunc<Element, Layout> func(lhs, rhs);
304
+ TensorForEach(
305
+ lhs.extent(),
306
+ func
307
+ );
308
+
309
+ return double(func);
310
+ }
311
+
312
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
313
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
314
+
315
+ /// Returns the greatest error between two tensors.
316
+ template <
317
+ typename Element, ///< Element type
318
+ typename Layout> ///< Layout function
319
+ double TensorGreatestError(
320
+ TensorView<Element, Layout> const &lhs,
321
+ TensorView<Element, Layout> const &rhs) {
322
+
323
+ // Extents must be identical
324
+ if (lhs.extent() != rhs.extent()) {
325
+ return -1;
326
+ }
327
+
328
+ detail::TensorGreatestErrorFunc<Element, Layout> func(lhs, rhs);
329
+ TensorForEach(
330
+ lhs.extent(),
331
+ func
332
+ );
333
+
334
+ return double(func);
335
+ }
336
+
337
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
338
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
339
+
340
+ /// Returns true if two tensor views are equal.
341
+ template <
342
+ typename Element, ///< Element type
343
+ typename Layout> ///< Layout function
344
+ bool TensorEquals(
345
+ TensorView<Element, Layout> const &lhs,
346
+ TensorView<Element, Layout> const &rhs) {
347
+
348
+ // Extents must be identical
349
+ if (lhs.extent() != rhs.extent()) {
350
+ return false;
351
+ }
352
+
353
+ detail::TensorEqualsFunc<Element, Layout> func(lhs, rhs);
354
+ TensorForEach(
355
+ lhs.extent(),
356
+ func
357
+ );
358
+
359
+ return bool(func);
360
+ }
361
+
362
+ /// Returns true if two tensor views are equal.
363
+ template <
364
+ typename Element, ///< Element type
365
+ typename Layout> ///< Layout function
366
+ bool TensorEquals(
367
+ TensorViewPlanarComplex<Element, Layout> const &lhs,
368
+ TensorViewPlanarComplex<Element, Layout> const &rhs) {
369
+
370
+ // Extents must be identical
371
+ if (lhs.extent() != rhs.extent()) {
372
+ return false;
373
+ }
374
+
375
+ detail::TensorEqualsFunc<Element, Layout> real_func(
376
+ {lhs.data(), lhs.layout(), lhs.extent()},
377
+ {rhs.data(), rhs.layout(), rhs.extent()}
378
+ );
379
+
380
+ TensorForEach(
381
+ lhs.extent(),
382
+ real_func
383
+ );
384
+
385
+ if (!bool(real_func)) {
386
+ return false;
387
+ }
388
+
389
+ detail::TensorEqualsFunc<Element, Layout> imag_func(
390
+ {lhs.data() + lhs.imaginary_stride(), lhs.layout(), lhs.extent()},
391
+ {rhs.data() + rhs.imaginary_stride(), rhs.layout(), rhs.extent()}
392
+ );
393
+
394
+ TensorForEach(
395
+ lhs.extent(),
396
+ imag_func
397
+ );
398
+
399
+ return bool(imag_func);
400
+ }
401
+
402
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
403
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
404
+
405
+ /// Returns true if two tensor views are relatively equal.
406
+ template <
407
+ typename Element, ///< Element type
408
+ typename Layout> ///< Layout function
409
+ bool TensorRelativelyEquals(
410
+ TensorView<Element, Layout> const &lhs,
411
+ TensorView<Element, Layout> const &rhs,
412
+ Element epsilon,
413
+ Element nonzero_floor) {
414
+
415
+ // Extents must be identical
416
+ if (lhs.extent() != rhs.extent()) {
417
+ return false;
418
+ }
419
+
420
+ detail::TensorRelativelyEqualsFunc<Element, Layout> func(lhs, rhs, epsilon, nonzero_floor);
421
+ TensorForEach(
422
+ lhs.extent(),
423
+ func
424
+ );
425
+
426
+ return bool(func);
427
+ }
428
+
429
+ /// Returns true if two tensor views are relatively equal.
430
+ template <
431
+ typename Element, ///< Element type
432
+ typename Layout> ///< Layout function
433
+ bool TensorRelativelyEquals(
434
+ TensorViewPlanarComplex<Element, Layout> const &lhs,
435
+ TensorViewPlanarComplex<Element, Layout> const &rhs,
436
+ Element epsilon,
437
+ Element nonzero_floor) {
438
+
439
+ // Extents must be identical
440
+ if (lhs.extent() != rhs.extent()) {
441
+ return false;
442
+ }
443
+
444
+ detail::TensorRelativelyEqualsFunc<Element, Layout> real_func(
445
+ {lhs.data(), lhs.layout(), lhs.extent()},
446
+ {rhs.data(), rhs.layout(), rhs.extent()},
447
+ epsilon,
448
+ nonzero_floor
449
+ );
450
+
451
+ TensorForEach(
452
+ lhs.extent(),
453
+ real_func
454
+ );
455
+
456
+ if (!bool(real_func)) {
457
+ return false;
458
+ }
459
+
460
+ detail::TensorEqualsFunc<Element, Layout> imag_func(
461
+ {lhs.data() + lhs.imaginary_stride(), lhs.layout(), lhs.extent()},
462
+ {rhs.data() + rhs.imaginary_stride(), rhs.layout(), rhs.extent()},
463
+ epsilon,
464
+ nonzero_floor
465
+ );
466
+
467
+ TensorForEach(
468
+ lhs.extent(),
469
+ imag_func
470
+ );
471
+
472
+ return bool(imag_func);
473
+ }
474
+
475
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
476
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
477
+
478
+ /// Returns true if two tensor views are NOT equal.
479
+ template <
480
+ typename Element, ///< Element type
481
+ typename Layout> ///< Layout function
482
+ bool TensorNotEquals(
483
+ TensorView<Element, Layout> const &lhs,
484
+ TensorView<Element, Layout> const &rhs) {
485
+
486
+ // Extents must be identical
487
+ if (lhs.extent() != rhs.extent()) {
488
+ return true;
489
+ }
490
+
491
+ detail::TensorEqualsFunc<Element, Layout> func(lhs, rhs);
492
+ TensorForEach(
493
+ lhs.extent(),
494
+ func
495
+ );
496
+
497
+ return !bool(func);
498
+ }
499
+
500
+ /// Returns true if two tensor views are equal.
501
+ template <
502
+ typename Element, ///< Element type
503
+ typename Layout> ///< Layout function
504
+ bool TensorNotEquals(
505
+ TensorViewPlanarComplex<Element, Layout> const &lhs,
506
+ TensorViewPlanarComplex<Element, Layout> const &rhs) {
507
+
508
+ return !TensorEquals(lhs, rhs);
509
+ }
510
+
511
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
512
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
513
+
514
+ namespace detail {
515
+
516
+ template <
517
+ typename Element, ///< Element type
518
+ typename Layout> ///< Layout function
519
+ struct TensorContainsFunc {
520
+
521
+ //
522
+ // Data members
523
+ //
524
+
525
+ TensorView<Element, Layout> view;
526
+ Element value;
527
+ bool contains;
528
+ Coord<Layout::kRank> location;
529
+
530
+ //
531
+ // Methods
532
+ //
533
+
534
+ /// Ctor
535
+ TensorContainsFunc(): contains(false) { }
536
+
537
+ /// Ctor
538
+ TensorContainsFunc(
539
+ TensorView<Element, Layout> const &view_,
540
+ Element value_
541
+ ) :
542
+ view(view_), value(value_), contains(false) { }
543
+
544
+ /// Visits a coordinate
545
+ void operator()(Coord<Layout::kRank> const &coord) {
546
+
547
+ if (view.at(coord) == value) {
548
+ if (!contains) {
549
+ location = coord;
550
+ }
551
+ contains = true;
552
+ }
553
+ }
554
+
555
+ /// Returns true if equal
556
+ operator bool() const {
557
+ return contains;
558
+ }
559
+ };
560
+
561
+ } // namespace detail
562
+
563
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
564
+
565
+ /// Returns true if a value is present in a tensor
566
+ template <
567
+ typename Element, ///< Element type
568
+ typename Layout> ///< Layout function
569
+ bool TensorContains(
570
+ TensorView<Element, Layout> const & view,
571
+ Element value) {
572
+
573
+ detail::TensorContainsFunc<Element, Layout> func(
574
+ view,
575
+ value
576
+ );
577
+
578
+ TensorForEach(
579
+ view.extent(),
580
+ func
581
+ );
582
+
583
+ return bool(func);
584
+ }
585
+
586
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
587
+
588
+ /// Returns a pair containing a boolean of whether a value exists in a tensor and the location of
589
+ /// of the first occurrence. If the value is not contained in the tensor, the second element of the
590
+ /// pair is undefined.
591
+ template <
592
+ typename Element, ///< Element type
593
+ typename Layout> ///< Layout function
594
+ std::pair<bool, Coord<Layout::kRank> > TensorFind(
595
+ TensorView<Element, Layout> const & view,
596
+ Element value) {
597
+
598
+ detail::TensorContainsFunc<Element, Layout> func(
599
+ view,
600
+ value
601
+ );
602
+
603
+ TensorForEach(
604
+ view.extent(),
605
+ func
606
+ );
607
+
608
+ return std::make_pair(bool(func), func.location);
609
+ }
610
+
611
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
612
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
613
+
614
+ } // namespace host
615
+ } // namespace reference
616
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_compare.hpp ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Provides several functions for filling tensors with data.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ // Standard Library includes
38
+ #include <utility>
39
+ #include <cstdlib>
40
+ #include <cmath>
41
+
42
+ // Cute includes
43
+ #include "cute/tensor.hpp"
44
+
45
+ // Cutlass includes
46
+ #include "cutlass/cutlass.h"
47
+ #include "cutlass/complex.h"
48
+ #include "cutlass/quaternion.h"
49
+ #include "cutlass/array.h"
50
+ #include "cutlass/numeric_types.h"
51
+
52
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ namespace cutlass {
55
+ namespace reference {
56
+ namespace host {
57
+
58
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
59
+
60
+ /// Returns true if two tensor views are equal.
61
+ template <
62
+ typename TensorL,
63
+ typename TensorR
64
+ >
65
+ bool TensorEquals(
66
+ TensorL lhs,
67
+ TensorR rhs) {
68
+
69
+ // Extents must be identical
70
+ if (cute::size(lhs) != cute::size(rhs)) {
71
+ return false;
72
+ }
73
+
74
+ for (int64_t idx = 0; idx < cute::size(lhs); ++idx) {
75
+ if (lhs(idx) != rhs(idx)) {
76
+ return false;
77
+ }
78
+ }
79
+
80
+ return true;
81
+ }
82
+
83
/// Returns true if two tensor views are NOT equal.
template <
  typename TensorL,
  typename TensorR
>
bool TensorNotEquals(
  TensorL lhs,
  TensorR rhs) {

  // Bug fix: this previously returned TensorEquals(lhs, rhs) without negation,
  // i.e. it reported "not equal" exactly when the tensors WERE equal.
  return !TensorEquals(lhs, rhs);
}
94
+
95
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
96
+
97
+ } // namespace host
98
+ } // namespace reference
99
+ } // namespace cutlass
100
+
101
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_copy.h ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Defines host-side elementwise operations on TensorView.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ // Standard Library includes
38
+ #include <utility>
39
+
40
+ // Cutlass includes
41
+ #include "cutlass/cutlass.h"
42
+ #include "tensor_foreach.h"
43
+
44
+ namespace cutlass {
45
+ namespace reference {
46
+ namespace host {
47
+
48
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace detail {
51
+
52
/// Helper functor that converts a source element to the destination element
/// type using the destination type's converting constructor.
template <
  typename DstElement,
  typename SrcElement
>
struct TrivialConvert {

  TrivialConvert() { }

  /// Converts a single element by value.
  DstElement operator()(SrcElement src) const {
    return static_cast<DstElement>(src);
  }
};
65
+
66
/// Helper to conditionally copy between tensor views.
///
/// Invoked once per coordinate (e.g. by TensorForEach); an element is copied
/// only when the coordinate lies within the bounds of BOTH the destination
/// and the source views, applying the conversion functor F to each element.
template <
  typename DstElement,
  typename DstLayout,
  typename SrcElement,
  typename SrcLayout,
  typename F
>
struct TensorCopyIf {

  using DstTensorView = TensorView<DstElement, DstLayout>;
  using SrcTensorView = TensorView<SrcElement, SrcLayout>;

  //
  // Data members
  //

  DstTensorView dst;   // destination view written to
  SrcTensorView src;   // source view read from
  F convert;           // element-wise conversion functor applied on copy

  //
  // Methods
  //

  /// Default constructor; views are empty and must be assigned before use.
  TensorCopyIf() { }

  /// Constructs the functor from destination/source views and a converter.
  TensorCopyIf(
    DstTensorView const &dst_,
    SrcTensorView const &src_,
    F const &convert_): dst(dst_), src(src_), convert(convert_) {}

  /// Copies based on destination and source bounds
  void operator()(Coord<DstLayout::kRank> const &coord) {
    if (dst.contains(coord) && src.contains(coord)) {
      dst.at(coord) = convert(src.at(coord));
    }
  }
};
105
+
106
+ } // namespace detail
107
+
108
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
109
+
110
+ /// Copies elements from one tensor view into another, satisfying bounds of each tensor.
111
+ template <
112
+ typename DstElement, /// Destination tensor's element type
113
+ typename DstLayout, /// Destination tensor's layout
114
+ typename SrcElement, /// Source tensor's element type
115
+ typename SrcLayout, /// Source tensor's layout
116
+ typename F /// Transformation functor
117
+ >
118
+ void TensorCopy(
119
+ TensorView<DstElement, DstLayout> dst,
120
+ TensorView<SrcElement, SrcLayout> src,
121
+ F const &transform) {
122
+
123
+ using CopyIf = detail::TensorCopyIf<
124
+ DstElement,
125
+ DstLayout,
126
+ SrcElement,
127
+ SrcLayout,
128
+ F>;
129
+
130
+ CopyIf copy_if(dst, src, transform);
131
+
132
+ TensorForEach(dst.extent(), copy_if);
133
+ }
134
+
135
+
136
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
137
+
138
+ /// Copies elements from a TensorRef into a TensorView. Assumes source tensor has sufficient extent
139
+ /// to avoid out of bounds accesses.
140
+ template <
141
+ typename DstElement, /// Destination tensor's element type
142
+ typename DstLayout, /// Destination tensor's layout
143
+ typename SrcElement, /// Source tensor's element type
144
+ typename SrcLayout, /// Source tensor's layout
145
+ typename F /// Transformation functor
146
+ >
147
+ void TensorCopy(
148
+ TensorView<DstElement, DstLayout> dst,
149
+ TensorRef<SrcElement, SrcLayout> src,
150
+ F const &transform) {
151
+
152
+ using CopyIf = detail::TensorCopyIf<
153
+ DstElement,
154
+ DstLayout,
155
+ SrcElement,
156
+ SrcLayout,
157
+ F>;
158
+
159
+ TensorView<SrcElement, SrcLayout> src_view(src, dst.extent());
160
+
161
+ CopyIf copy_if(dst, src_view, transform);
162
+
163
+ TensorForEach(dst.extent(), copy_if);
164
+ }
165
+
166
+ /// Copies elements from a TensorRef into a TensorView. Assumes source tensor has sufficient extent
167
+ /// to avoid out of bounds accesses.
168
+ template <
169
+ typename DstElement, /// Destination tensor's element type
170
+ typename DstLayout, /// Destination tensor's layout
171
+ typename SrcElement, /// Source tensor's element type
172
+ typename SrcLayout, /// Source tensor's layout
173
+ typename F /// Transformation functor
174
+ >
175
+ void TensorCopy(
176
+ TensorRef<DstElement, DstLayout> dst,
177
+ TensorView<SrcElement, SrcLayout> src,
178
+ F const &transform) {
179
+
180
+ using CopyIf = detail::TensorCopyIf<
181
+ DstElement,
182
+ DstLayout,
183
+ SrcElement,
184
+ SrcLayout,
185
+ F>;
186
+
187
+ TensorView<DstElement, DstLayout> dst_view(dst, src.extent());
188
+
189
+ CopyIf copy_if(dst_view, src, transform);
190
+
191
+ TensorForEach(src.extent(), copy_if);
192
+ }
193
+
194
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
195
+
196
+ /// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds
197
+ /// if SrcElement can be converted to DstElement.
198
+ template <
199
+ typename DstElement, /// Destination tensor's element type
200
+ typename DstLayout, /// Destination tensor's layout
201
+ typename SrcElement, /// Source tensor's element type
202
+ typename SrcLayout /// Source tensor's layout
203
+ >
204
+ void TensorCopy(
205
+ TensorView<DstElement, DstLayout> dst,
206
+ TensorView<SrcElement, SrcLayout> src) {
207
+
208
+ detail::TrivialConvert<DstElement, SrcElement> convert;
209
+
210
+ TensorCopy(dst, src, convert);
211
+ }
212
+
213
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
214
+
215
+ /// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds
216
+ /// if SrcElement can be converted to DstElement.
217
+ template <
218
+ typename DstElement, /// Destination tensor's element type
219
+ typename DstLayout, /// Destination tensor's layout
220
+ typename SrcElement, /// Source tensor's element type
221
+ typename SrcLayout, /// Source tensor's layout
222
+ typename F /// Transformation functor
223
+ >
224
+ void TensorCopy(
225
+ TensorView<DstElement, DstLayout> dst,
226
+ TensorRef<SrcElement, SrcLayout> src) {
227
+
228
+ detail::TrivialConvert<DstElement, SrcElement> convert;
229
+
230
+ TensorCopy(dst, src, convert);
231
+ }
232
+
233
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
234
+
235
+ /// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds
236
+ /// if SrcElement can be converted to DstElement.
237
+ template <
238
+ typename DstElement, /// Destination tensor's element type
239
+ typename DstLayout, /// Destination tensor's layout
240
+ typename SrcElement, /// Source tensor's element type
241
+ typename SrcLayout /// Source tensor's layout
242
+ >
243
+ void TensorCopy(
244
+ TensorRef<DstElement, DstLayout> dst,
245
+ TensorView<SrcElement, SrcLayout> src) {
246
+
247
+ detail::TrivialConvert<DstElement, SrcElement> convert;
248
+
249
+ TensorCopy(dst, src, convert);
250
+ }
251
+
252
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
253
+
254
+ } // namespace host
255
+ } // namespace reference
256
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Defines host-side elementwise operations on TensorView.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ // Cutlass includes
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/functional.h"
40
+
41
+ #include "tensor_foreach.h"
42
+
43
+ namespace cutlass {
44
+ namespace reference {
45
+ namespace host {
46
+
47
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
48
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace detail {
51
+
52
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
/// Helper to apply a binary operator elementwise:
///   view_d.at(coord) = func(view_a.at(coord), view_b.at(coord))
/// Intended to be invoked once per coordinate by TensorForEach.
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementD,
  typename LayoutD,
  typename BinaryFunc>
struct TensorFuncBinaryOp {

  //
  // Data members
  //

  /// Destination view written with the result of the binary operator
  TensorView<ElementD, LayoutD> view_d;
  /// Left-hand operand tensor
  TensorRef<ElementA, LayoutA> view_a;
  /// Right-hand operand tensor
  TensorRef<ElementB, LayoutB> view_b;
  /// Binary functor applied to each pair of elements
  BinaryFunc func;

  //
  // Methods
  //

  /// Constructor
  TensorFuncBinaryOp() { }

  /// Constructor
  TensorFuncBinaryOp(
    TensorView<ElementD, LayoutD> const & view_d_,
    TensorRef<ElementA, LayoutA> const & view_a_,
    TensorRef<ElementB, LayoutB> const & view_b_,
    BinaryFunc func = BinaryFunc()
  ):
    view_d(view_d_), view_a(view_a_), view_b(view_b_), func(func) { }

  /// Applies the binary operator to the operands at 'coord' and stores the
  /// result. Note both operands are converted to ElementD before the operation.
  void operator()(Coord<LayoutD::kRank> const &coord) const {
    view_d.at(coord) = func(
      ElementD(view_a.at(coord)),
      ElementD(view_b.at(coord))
    );
  }
};
99
+
100
+ } // namespace detail
101
+
102
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
103
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
104
+
105
+ /// Adds two tensors and stores in the destination tensor: d = a + b
106
+ template <
107
+ typename ElementD,
108
+ typename LayoutD,
109
+ typename ElementA,
110
+ typename LayoutA,
111
+ typename ElementB,
112
+ typename LayoutB
113
+ >
114
+ void TensorAdd(
115
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
116
+ TensorRef<ElementA, LayoutA> a, ///< A tensor reference
117
+ TensorRef<ElementB, LayoutB> b ///< B tensor reference
118
+ ) {
119
+
120
+ detail::TensorFuncBinaryOp<
121
+ ElementD,
122
+ LayoutD,
123
+ ElementA,
124
+ LayoutA,
125
+ ElementB,
126
+ LayoutB,
127
+ cutlass::plus<ElementD>
128
+ > func(d, a, b);
129
+
130
+ TensorForEach(
131
+ d.extent(),
132
+ func);
133
+ }
134
+
135
+ /// Adds a tensor in place: d = d .+ a
136
+ template <
137
+ typename ElementD,
138
+ typename LayoutD,
139
+ typename ElementA,
140
+ typename LayoutA
141
+ >
142
+ void TensorAdd(
143
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
144
+ TensorRef<ElementA, LayoutA> a ///< A tensor reference
145
+ ) {
146
+ TensorAdd(d, d, a);
147
+ }
148
+
149
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
150
+
151
+ /// Subtracts two tensors and stores in the destination tensor: d = a - b
152
+ template <
153
+ typename ElementD,
154
+ typename LayoutD,
155
+ typename ElementA,
156
+ typename LayoutA,
157
+ typename ElementB,
158
+ typename LayoutB
159
+ >
160
+ void TensorSub(
161
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
162
+ TensorRef<ElementA, LayoutA> a, ///< A tensor reference
163
+ TensorRef<ElementB, LayoutB> b ///< B tensor reference
164
+ ) {
165
+
166
+ detail::TensorFuncBinaryOp<
167
+ ElementD,
168
+ LayoutD,
169
+ ElementA,
170
+ LayoutA,
171
+ ElementB,
172
+ LayoutB,
173
+ cutlass::minus<ElementD>
174
+ > func(d, a, b);
175
+
176
+ TensorForEach(
177
+ d.extent(),
178
+ func);
179
+ }
180
+
181
+ /// Subtracts two tensors in place: d = d .- a
182
+ template <
183
+ typename ElementD,
184
+ typename LayoutD,
185
+ typename ElementA,
186
+ typename LayoutA,
187
+ typename ElementB,
188
+ typename LayoutB
189
+ >
190
+ void TensorSub(
191
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
192
+ TensorRef<ElementA, LayoutA> a ///< A tensor reference
193
+ ) {
194
+
195
+ TensorSub(d, d, a);
196
+ }
197
+
198
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
199
+
200
+ /// Multiplies two tensors and stores in the destination tensor: d = a .* b
201
+ template <
202
+ typename ElementD,
203
+ typename LayoutD,
204
+ typename ElementA,
205
+ typename LayoutA,
206
+ typename ElementB,
207
+ typename LayoutB
208
+ >
209
+ void TensorMul(
210
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
211
+ TensorRef<ElementA, LayoutA> a, ///< A tensor reference
212
+ TensorRef<ElementB, LayoutB> b ///< B tensor reference
213
+ ) {
214
+
215
+ detail::TensorFuncBinaryOp<
216
+ ElementD,
217
+ LayoutD,
218
+ ElementA,
219
+ LayoutA,
220
+ ElementB,
221
+ LayoutB,
222
+ cutlass::multiplies<ElementD>
223
+ > func(d, a, b);
224
+
225
+ TensorForEach(
226
+ d.extent(),
227
+ func);
228
+ }
229
+
230
+ /// Multiplies tensors in place: d = d .* a
231
+ template <
232
+ typename ElementD,
233
+ typename LayoutD,
234
+ typename ElementA,
235
+ typename LayoutA
236
+ >
237
+ void TensorMul(
238
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
239
+ TensorRef<ElementA, LayoutA> a ///< A tensor reference
240
+ ) {
241
+ TensorMul(d, d, a);
242
+ }
243
+
244
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
245
+
246
+ /// Divides two tensors and stores in the destination tensor: d = a ./ b
247
+ template <
248
+ typename ElementD,
249
+ typename LayoutD,
250
+ typename ElementA,
251
+ typename LayoutA,
252
+ typename ElementB,
253
+ typename LayoutB
254
+ >
255
+ void TensorDiv(
256
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
257
+ TensorRef<ElementA, LayoutA> a, ///< A tensor reference
258
+ TensorRef<ElementB, LayoutB> b ///< B tensor reference
259
+ ) {
260
+
261
+ detail::TensorFuncBinaryOp<
262
+ ElementD,
263
+ LayoutD,
264
+ ElementA,
265
+ LayoutA,
266
+ ElementB,
267
+ LayoutB,
268
+ cutlass::divides<ElementD>
269
+ > func(d, a, b);
270
+
271
+ TensorForEach(
272
+ d.extent(),
273
+ func);
274
+ }
275
+
276
+ /// Divides tensors in place: d = d ./ a
277
+ template <
278
+ typename ElementD,
279
+ typename LayoutD,
280
+ typename ElementA,
281
+ typename LayoutA
282
+ >
283
+ void TensorDiv(
284
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
285
+ TensorRef<ElementA, LayoutA> a ///< A tensor reference
286
+ ) {
287
+ TensorDiv(d, d, a);
288
+ }
289
+
290
+
291
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
292
+
293
+ /// Divides two tensors and stores in the destination tensor: d = a ./ b
294
+ template <
295
+ typename ElementD,
296
+ typename LayoutD,
297
+ typename ElementA,
298
+ typename LayoutA,
299
+ typename ElementB,
300
+ typename LayoutB
301
+ >
302
+ void TensorModulus(
303
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
304
+ TensorRef<ElementA, LayoutA> a, ///< A tensor reference
305
+ TensorRef<ElementB, LayoutB> b ///< B tensor reference
306
+ ) {
307
+
308
+ detail::TensorFuncBinaryOp<
309
+ ElementD,
310
+ LayoutD,
311
+ ElementA,
312
+ LayoutA,
313
+ ElementB,
314
+ LayoutB,
315
+ cutlass::divides<ElementD>
316
+ > func(d, a, b);
317
+
318
+ TensorForEach(
319
+ d.extent(),
320
+ func);
321
+ }
322
+
323
+ /// Divides tensors in place: d = d ./ a
324
+ template <
325
+ typename ElementD,
326
+ typename LayoutD,
327
+ typename ElementA,
328
+ typename LayoutA
329
+ >
330
+ void TensorModulus(
331
+ TensorView<ElementD, LayoutD> d, ///< destination tensor view
332
+ TensorRef<ElementA, LayoutA> a ///< A tensor reference
333
+ ) {
334
+ TensorDiv(d, d, a);
335
+ }
336
+
337
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
338
+
339
+ } // namespace host
340
+ } // namespace reference
341
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_fill.h ADDED
@@ -0,0 +1,1718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Provides several functions for filling tensors with data.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ // Standard Library includes
38
+ #include <utility>
39
+ #include <cstdlib>
40
+ #include <cmath>
41
+ #include <random>
42
+ #include <stdexcept>
43
+
44
+ // Cutlass includes
45
+ #include "cutlass/cutlass.h"
46
+ #include "cutlass/complex.h"
47
+ #include "cutlass/quaternion.h"
48
+ #include "cutlass/array.h"
49
+ #include "cutlass/numeric_types.h"
50
+ #include "cutlass/subbyte_reference.h"
51
+ #include "cutlass/tensor_view.h"
52
+ #include "cutlass/tensor_view_planar_complex.h"
53
+ #include "cutlass/blas3.h"
54
+
55
+ #include "cutlass/util/distribution.h"
56
+ #include "tensor_foreach.h"
57
+
58
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
59
+
60
+ namespace cutlass {
61
+ namespace reference {
62
+ namespace host {
63
+
64
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
65
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
66
+
67
+ namespace detail {
68
+
69
/// Functor that writes a constant value at every coordinate it is invoked
/// with; intended to be driven by TensorForEach.
template <
  typename Element, ///< Element type
  typename Layout>  ///< Layout function
struct TensorFillFunc {

  using TensorView = TensorView<Element, Layout>;

  //
  // Data members
  //

  TensorView view;  // destination view written to
  Element value;    // constant stored at each visited coordinate

  //
  // Methods
  //

  /// Constructs the functor from a destination view and fill value.
  TensorFillFunc(
    TensorView const &view_ = TensorView(),
    Element value_ = Element(0)
  ): view(view_), value(value_) { }

  /// Stores 'value' at the given coordinate.
  void operator()(Coord<Layout::kRank> const & coord) const {
    view.at(coord) = value;
  }
};
96
+
97
/// Returns a pair of values of the Gaussian distribution generated by the Box-Muller method.
struct BoxMullerFunc {

  BoxMullerFunc() {}

  /// Writes two normally distributed samples into rnd[0] and rnd[1].
  void operator()(
    double* rnd,                   ///< Size-2 vector to be filled with random values
    double mean = 0,               ///< Mean of the Gaussian distribution
    double stddev = 1,             ///< Standard deviation of the Gaussian distribution
    double pi = std::acos(-1)) const {

    // Two independent uniforms in [0, 1] drawn from the C RNG.
    double const u1 = double(std::rand()) / double(RAND_MAX);
    double const u2 = double(std::rand()) / double(RAND_MAX);

    // Box-Muller: map the uniform pair onto a standard-normal pair.
    double const radius = std::sqrt(-2 * std::log(u1));
    rnd[0] = radius * std::cos(2 * pi * u2);
    rnd[1] = radius * std::sin(2 * pi * u2);

    // Shift and scale to the requested mean / standard deviation.
    rnd[0] = mean + stddev * rnd[0];
    rnd[1] = mean + stddev * rnd[1];
  }
};
116
+ } // namespace detail
117
+
118
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
119
+
120
+ /// Fills a tensor with a uniform value
121
+ template <
122
+ typename Element, ///< Element type
123
+ typename Layout> ///< Layout function
124
+ void TensorFill(
125
+ TensorView<Element, Layout> dst, ///< destination tensor
126
+ Element val = Element(0)) { ///< value to uniformly fill it with
127
+
128
+ detail::TensorFillFunc<Element, Layout> func(dst, val);
129
+
130
+ TensorForEach(
131
+ dst.extent(),
132
+ func
133
+ );
134
+ }
135
+
136
+ /// Fills a tensor with a uniform value
137
+ template <
138
+ typename Element, ///< Element type
139
+ typename Layout> ///< Layout function
140
+ void TensorFill(
141
+ TensorViewPlanarComplex<Element, Layout> dst, ///< destination tensor
142
+ cutlass::complex<Element> val = cutlass::complex<Element>(0)) { ///< value to uniformly fill it with
143
+
144
+ TensorFill(dst.view_real(), val.real());
145
+ TensorFill(dst.view_imag(), val.imag());
146
+ }
147
+
148
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
149
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
150
+
151
+ namespace detail {
152
+
153
/// Generates Gaussian-distributed scalars with optional sparsification,
/// quantization to a fixed number of fractional bits, and zero exclusion.
template <typename Element>
struct RandomGaussianFunc {

  uint64_t seed;        // seed passed to std::srand at construction
  double mean;          // Gaussian distribution mean
  double stddev;        // Gaussian distribution standard deviation
  int int_scale;        // if non-negative, number of fractional bits retained
  double pi;            // cached value of pi used by Box-Muller
  double pnz;           // probability that an element is nonzero
  bool exclude_zero;    // if true, exact-zero results are nudged off zero

  //
  // Methods
  //
  RandomGaussianFunc(
    uint64_t seed_ = 0,
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1,
    double pnz_ = 1.0,
    bool exclude_zero_ = false
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

  /// Compute random value and update RNG state.
  Element operator()() const {

    // Box-Muller transform: two uniforms in [0, 1] -> one standard normal.
    double const u1 = double(std::rand()) / double(RAND_MAX);
    double const u2 = double(std::rand()) / double(RAND_MAX);
    double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2);
    rnd = mean + stddev * rnd;

    // Bernoulli draw with probability pnz decides whether this element is nonzero.
    // NOTE(review): the engine is re-seeded from std::random_device on every call,
    // so the zero/nonzero pattern is not reproducible from `seed`.
    std::random_device rnd_device;
    std::mt19937 bernoulli_rnd(rnd_device());
    std::bernoulli_distribution bernoulli_dist(pnz);

    Element result;
    if (bernoulli_dist(bernoulli_rnd)) {
      if (int_scale >= 0) {
        // Quantize to int_scale fractional bits to facilitate error testing.
        rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale);
      }
      result = static_cast<Element>(rnd);
    }
    else {
      result = static_cast<Element>(0);
    }

    // Optionally push exact zeros away from zero; this overrides a Bernoulli zero.
    if (exclude_zero && result == Element(0)) {
      rnd += (rnd > 0) ? 1 : -1;
      result = Element(rnd);
    }

    return result;
  }
};
226
+
227
/// Partial specialization for initializing a complex value.
template <typename Element>
struct RandomGaussianFunc<complex<Element> > {

  uint64_t seed;        // seed passed to std::srand at construction
  double mean;          // Gaussian distribution mean
  double stddev;        // Gaussian distribution standard deviation
  int int_scale;        // if non-negative, number of fractional bits retained
  double pi;            // cached pi, forwarded to the Box-Muller helper
  double pnz;           // probability that an element is nonzero
  bool exclude_zero;    // if true, an all-zero sample is nudged off zero

  //
  // Methods
  //
  RandomGaussianFunc(
    uint64_t seed_ = 0,
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1,
    double pnz_ = 1.0,
    bool exclude_zero_ = false
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

  /// Compute random value and update RNG state
  complex<Element> operator()() const {

    Element reals[2];

    // One Box-Muller draw yields the (real, imaginary) pair.
    double rnd[2];
    detail::BoxMullerFunc func;
    func(rnd, mean, stddev, pi);

    // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian.
    // NOTE(review): the engine is re-seeded from std::random_device on every call, so the
    // zero/nonzero pattern is not reproducible from `seed`.
    std::random_device rnd_device;
    std::mt19937 bernoulli_rnd(rnd_device());
    std::bernoulli_distribution bernoulli_dist(pnz);
    bool bernoulli_result = bernoulli_dist(bernoulli_rnd);

    // Sample from the Gaussian distribution for a nonzero element
    if (bernoulli_result) {
      if (int_scale >= 0) {
        // Round to int_scale fractional bits before converting to Element.
        rnd[0] = double(std::llround(rnd[0] * double(1 << int_scale)));
        rnd[1] = double(std::llround(rnd[1] * double(1 << int_scale)));
        reals[0] = from_real<Element>(rnd[0] / double(1 << int_scale));
        reals[1] = from_real<Element>(rnd[1] / double(1 << int_scale));
      }
      else {
        reals[0] = from_real<Element>(rnd[0]);
        reals[1] = from_real<Element>(rnd[1]);
      }
    }
    else {
      reals[0] = from_real<Element>(0);
      reals[1] = from_real<Element>(0);
    }

    // Note that this will invalidate the above else statement because it unsets zero elements.
    // Only the real part is adjusted; the imaginary part keeps its zero value.
    if (exclude_zero &&
        reals[0] == from_real<Element>(0.0) &&
        reals[1] == from_real<Element>(0.0)) {

      if (rnd[0] > 0.0) {
        rnd[0] += 1.0;
      } else {
        rnd[0] -= 1.0;
      }
      reals[0] = from_real<Element>(rnd[0]);
    }

    return complex<Element>(reals[0], reals[1]);
  }
};
303
+
304
/// Partial specialization for initializing a Quaternion value.
template <typename Element>
struct RandomGaussianFunc<Quaternion<Element> > {

  uint64_t seed;        // seed passed to std::srand at construction
  double mean;          // Gaussian distribution mean
  double stddev;        // Gaussian distribution standard deviation
  int int_scale;        // if non-negative, number of fractional bits retained
  double pi;            // cached pi, forwarded to the Box-Muller helper
  double pnz;           // probability that an element is nonzero
  bool exclude_zero;    // if true, an all-zero sample is nudged off zero

  //
  // Methods
  //
  RandomGaussianFunc(
    uint64_t seed_ = 0,
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1,
    double pnz_ = 1.0,
    bool exclude_zero_ = false
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

  /// Compute random value and update RNG state
  Quaternion<Element> operator()() const {

    Element reals[4];

    // Two Box-Muller draws produce the four quaternion components.
    double rnd1[2];
    double rnd2[2];
    detail::BoxMullerFunc func;
    func(rnd1, mean, stddev, pi);
    func(rnd2, mean, stddev, pi);

    // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian.
    // NOTE(review): the engine is re-seeded from std::random_device on every call, so the
    // zero/nonzero pattern is not reproducible from `seed`.
    std::random_device rnd_device;
    std::mt19937 bernoulli_rnd(rnd_device());
    std::bernoulli_distribution bernoulli_dist(pnz);
    bool bernoulli_result = bernoulli_dist(bernoulli_rnd);

    // Sample from the Gaussian distribution for a nonzero element
    if (bernoulli_result) {
      if (int_scale >= 0) {
        // Round each component to int_scale fractional bits before conversion.
        rnd1[0] = double(std::llround(rnd1[0] * double(1 << int_scale)));
        rnd1[1] = double(std::llround(rnd1[1] * double(1 << int_scale)));
        rnd2[0] = double(std::llround(rnd2[0] * double(1 << int_scale)));
        rnd2[1] = double(std::llround(rnd2[1] * double(1 << int_scale)));

        reals[0] = from_real<Element>(rnd1[0] / double(1 << int_scale));
        reals[1] = from_real<Element>(rnd1[1] / double(1 << int_scale));
        reals[2] = from_real<Element>(rnd2[0] / double(1 << int_scale));
        reals[3] = from_real<Element>(rnd2[1] / double(1 << int_scale));
      }
      else {
        reals[0] = from_real<Element>(rnd1[0]);
        reals[1] = from_real<Element>(rnd1[1]);
        reals[2] = from_real<Element>(rnd2[0]);
        reals[3] = from_real<Element>(rnd2[1]);
      }
    }
    else {
      reals[0] = from_real<Element>(0);
      reals[1] = from_real<Element>(0);
      reals[2] = from_real<Element>(0);
      reals[3] = from_real<Element>(0);
    }

    // Note that this will invalidate the above else statement because it unsets zero elements.
    // Only the first component is adjusted; the other three keep their zero values.
    if (exclude_zero &&
        reals[0] == from_real<Element>(0) &&
        reals[1] == from_real<Element>(0) &&
        reals[2] == from_real<Element>(0) &&
        reals[3] == from_real<Element>(0)) {

      if (rnd1[0] > 0.0) {
        rnd1[0] += 1.0;
      } else {
        rnd1[0] -= 1.0;
      }
      reals[0] = from_real<Element>(rnd1[0]);
    }

    return Quaternion<Element>(reals[0], reals[1], reals[2], reals[3]);
  }
};
393
+
394
+ /// Computes a random Gaussian distribution
395
+ template <
396
+ typename Element, ///< Element type
397
+ typename Layout> ///< Layout function
398
+ struct TensorFillGaussianFunc {
399
+
400
+ using TensorView = TensorView<Element, Layout>;
401
+
402
+ //
403
+ // Data members
404
+ //
405
+
406
+ TensorView view;
407
+ RandomGaussianFunc<Element> func;
408
+
409
+ //
410
+ // Methods
411
+ //
412
+
413
+ /// Construction of Gaussian RNG functor.
414
+ TensorFillGaussianFunc(
415
+ TensorView view_ = TensorView(),
416
+ RandomGaussianFunc<Element> func_ = RandomGaussianFunc<Element>()
417
+ ):
418
+ view(view_), func(func_) {
419
+
420
+ }
421
+
422
+ /// Compute random value and update RNG state
423
+ void operator()(Coord<Layout::kRank> const &coord) const {
424
+ view.at(coord) = func();
425
+ }
426
+ };
427
+
428
+ /// Computes a random Gaussian distribution for a rank-2 tensor
429
+ template <
430
+ typename Element, ///< Element type
431
+ typename Layout> ///< Layout function
432
+ struct TensorFillSymmetricGaussianFunc {
433
+
434
+ using TensorView = TensorView<Element, Layout>;
435
+
436
+ //
437
+ // Data members
438
+ //
439
+
440
+ TensorView view;
441
+ RandomGaussianFunc<Element> func;
442
+ cutlass::FillMode fill_mode;
443
+
444
+ //
445
+ // Methods
446
+ //
447
+
448
+ /// Construction of Gaussian RNG functor.
449
+ TensorFillSymmetricGaussianFunc(
450
+ TensorView view_ = TensorView(),
451
+ RandomGaussianFunc<Element> func_ = RandomGaussianFunc<Element>(),
452
+ cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid
453
+ ):
454
+ view(view_), func(func_), fill_mode(fill_mode_) {
455
+
456
+ }
457
+
458
+ /// Compute random value and update RNG state
459
+ void operator()(Coord<Layout::kRank> const &coord) const {
460
+ // Fill half of matrix based on FillMode
461
+ if (Layout::kRank == 2 &&
462
+ fill_mode == cutlass::FillMode::kLower &&
463
+ coord[0] >= coord[1]) {
464
+ view.at(coord) = func();
465
+ } else if (Layout::kRank == 2 &&
466
+ fill_mode == cutlass::FillMode::kUpper &&
467
+ coord[0] <= coord[1]) {
468
+ view.at(coord) = func();
469
+ }
470
+ }
471
+ };
472
+
473
+ } // namespace detail
474
+
475
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
476
+
477
+ /// Fills a tensor with random values with a Gaussian distribution.
478
+ template <
479
+ typename Element, ///< Element type
480
+ typename Layout> ///< Layout function
481
+ void TensorFillRandomGaussian(
482
+ TensorView<Element, Layout> dst, ///< destination tensor
483
+ uint64_t seed, ///< seed for RNG
484
+ double mean = 0, ///< Gaussian distribution's mean
485
+ double stddev = 1, ///< Gaussian distribution's standard deviation
486
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
487
+ double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of
488
+ /// data.
489
+ bool exclude_zero = false) { ///< Exclude zeros from tensor init.
490
+
491
+ detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz, exclude_zero);
492
+
493
+ detail::TensorFillGaussianFunc<Element, Layout> func(
494
+ dst,
495
+ random_func
496
+ );
497
+
498
+ TensorForEach(
499
+ dst.extent(),
500
+ func
501
+ );
502
+ }
503
+
504
+ /// Fills a tensor with random values with a Gaussian distribution.
505
+ template <
506
+ typename Element, ///< Element type
507
+ typename Layout> ///< Layout function
508
+ void TensorFillRandomGaussian(
509
+ TensorViewPlanarComplex<Element, Layout> dst, ///< destination tensor
510
+ uint64_t seed, ///< seed for RNG
511
+ double mean = 0, ///< Gaussian distribution's mean
512
+ double stddev = 1, ///< Gaussian distribution's standard deviation
513
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
514
+ double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of
515
+ /// data.
516
+ bool exclude_zero = false) { ///< Exclude zeros from tensor init.
517
+
518
+ TensorFillRandomGaussian(dst.view_real(), seed, mean, stddev, bits, pnz);
519
+ TensorFillRandomGaussian(dst.view_imag(), ~seed, mean, stddev, bits, pnz);
520
+ }
521
+
522
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
523
+ /// Fills the upper or lower part of a symmetric rank-2 tensor with random values of a Gaussian distribution.
524
+ template <
525
+ typename Element, ///< Element type
526
+ typename Layout> ///< Layout function
527
+ void TensorFillSymmetricRandomGaussian(
528
+ TensorView<Element, Layout> dst, ///< destination tensor
529
+ uint64_t seed, ///< seed for RNG
530
+ cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices
531
+ double mean = 0, ///< Gaussian distribution's mean
532
+ double stddev = 1, ///< Gaussian distribution's standard deviation
533
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
534
+ double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of
535
+ /// data.
536
+
537
+ detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
538
+
539
+ detail::TensorFillSymmetricGaussianFunc<Element, Layout> func(
540
+ dst,
541
+ random_func,
542
+ fill_mode
543
+ );
544
+
545
+ TensorForEach(
546
+ dst.extent(),
547
+ func
548
+ );
549
+ }
550
+
551
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
552
+
553
+ /// Fills a tensor with random values of a Gaussian distribution.
554
+ template <
555
+ typename Element ///< Element type
556
+ >
557
+ void BlockFillRandomGaussian(
558
+ Element *ptr, ///< destination buffer
559
+ size_t capacity, ///< number of elements
560
+ uint64_t seed, ///< seed for RNG
561
+ double mean = 0, ///< Gaussian distribution's mean
562
+ double stddev = 1, ///< Gaussian distribution's standard deviation
563
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
564
+ double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of
565
+ /// data.
566
+
567
+
568
+ detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
569
+
570
+ for (size_t i = 0; i < capacity; ++i) {
571
+ ReferenceFactory<Element>::get(ptr, i) = random_func();
572
+ }
573
+ }
574
+
575
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
576
+
577
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
578
+
579
+ namespace detail {
580
+
581
+ template <typename Element>
582
+ struct RandomUniformFunc {
583
+
584
+ using Real = typename RealType<Element>::Type;
585
+
586
+ uint64_t seed;
587
+ double range;
588
+ double min;
589
+ int int_scale;
590
+
591
+ double pnan;
592
+ private:
593
+ using engine_type = std::mt19937;
594
+ public:
595
+ engine_type bernoulli_rnd;
596
+ std::bernoulli_distribution bernoulli_dist;
597
+
598
+ bool exclude_zero;
599
+
600
+ RandomUniformFunc(
601
+ uint64_t seed_ = 0,
602
+ double max = 1,
603
+ double min_ = 0,
604
+ int int_scale_ = -1,
605
+ double pnan_ = 0,
606
+ bool exclude_zero_ = false
607
+ ):
608
+ seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
609
+ , bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
610
+ , bernoulli_dist(pnan_)
611
+ , exclude_zero(exclude_zero_)
612
+ {
613
+ std::srand((unsigned)seed);
614
+
615
+ // Handle cases where min = 0 or max = 0 for excluding zeros
616
+ if (exclude_zero) {
617
+ min = (min == 0.0) ? min + 1: min;
618
+ range = (max == 0.0) ? range - 1: range;
619
+ }
620
+ }
621
+
622
+
623
+ /// Compute random value and update RNG state
624
+ Element operator()() {
625
+
626
+ // Sample from NaN distribution.
627
+ if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
628
+ if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
629
+ return Element(NAN);
630
+ }
631
+ }
632
+
633
+ double rnd = double(std::rand()) / double(RAND_MAX);
634
+
635
+ rnd = min + range * rnd;
636
+
637
+ // Random values are cast to integer after scaling by a power of two to facilitate error
638
+ // testing
639
+ Element result;
640
+ if (int_scale >= 0) {
641
+ rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale);
642
+ result = static_cast<Element>(Real(rnd));
643
+ }
644
+ else {
645
+ result = static_cast<Element>(Real(rnd));
646
+ }
647
+
648
+ if (exclude_zero && result == Element(0)) {
649
+ if (rnd > 0.0) {
650
+ rnd = std::min(min + range, rnd + 1.0);
651
+ } else {
652
+ rnd = std::max(min, rnd - 1.0);
653
+ }
654
+ result = static_cast<Element>(Real(rnd));
655
+ }
656
+
657
+ return result;
658
+ }
659
+ };
660
+
661
/// Partial specialization for initializing a complex value.
template <typename Element>
struct RandomUniformFunc<complex<Element> > {

  using Real = typename RealType<Element>::Type;

  uint64_t seed;      // seed passed to std::srand at construction
  double range;       // width of the sampling interval (max - min)
  double min;         // lower bound of the sampling interval
  int int_scale;      // if non-negative, number of fractional bits retained

  double pnan;        // probability of emitting a quiet NaN
 private:
  using engine_type = std::mt19937;
 public:
  engine_type bernoulli_rnd;                  // engine driving the NaN Bernoulli draw
  std::bernoulli_distribution bernoulli_dist;

  bool exclude_zero;  // if true, a zero real part is nudged back into the interval

  //
  // Methods
  //

  RandomUniformFunc(
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1,
    double pnan_ = 0,
    bool exclude_zero_ = false
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
    , bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
    , bernoulli_dist(pnan_)
    , exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);

    // Handle cases where min = 0 or max = 0 for excluding zeros
    if (exclude_zero) {
      min = (min == 0.0) ? min + 1: min;
      range = (max == 0.0) ? range - 1: range;
    }
  }


  /// Compute random value and update RNG state
  complex<Element> operator()() {

    // Sample from NaN distribution. The branch is gated on the *component* type;
    // the returned Element(NAN) is implicitly converted to complex<Element>.
    if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
      if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
        return Element(NAN);
      }
    }

    Element reals[2];

    // reals[0] is the real part, reals[1] the imaginary part.
    for (int i = 0; i < 2; ++i) {
      double rnd = double(std::rand()) / double(RAND_MAX);

      rnd = min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      if (int_scale >= 0) {
        rnd = double(std::llround(rnd * double(1 << int_scale)));
        reals[i] = from_real<Element>(Real(rnd / double(1 << int_scale)));
      }
      else {
        reals[i] = from_real<Element>(Real(rnd));
      }

      // Zero exclusion only applies to the real part (i == 0); the imaginary
      // part may still be zero.
      if (exclude_zero &&
          i == 0 &&
          reals[0] == from_real<Element>(0.0)) {

        if (rnd > 0.0) {
          rnd = std::min(min + range, rnd + 1.0);
        } else {
          rnd = std::max(min, rnd - 1.0);
        }
        reals[0] = from_real<Element>(Real(rnd));
      }

    }

    return complex<Element>(reals[0], reals[1]);
  }
};
752
+
753
/// Partial specialization for initializing a Quaternion value.
/// Unlike the scalar and complex specializations, this one has no exclude_zero option.
template <typename Element>
struct RandomUniformFunc<Quaternion<Element> > {

  using Real = typename RealType<Element>::Type;

  uint64_t seed;      // seed passed to std::srand at construction
  double range;       // width of the sampling interval (max - min)
  double min;         // lower bound of the sampling interval
  int int_scale;      // if non-negative, number of fractional bits retained

  double pnan;        // probability of emitting a quiet NaN
 private:
  using engine_type = std::mt19937;
 public:
  engine_type bernoulli_rnd;                  // engine driving the NaN Bernoulli draw
  std::bernoulli_distribution bernoulli_dist;

  //
  // Methods
  //

  RandomUniformFunc(
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1,
    double pnan_ = 0
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_),
    bernoulli_rnd{static_cast<engine_type::result_type>(seed_)},
    bernoulli_dist(pnan_)
  {
    std::srand((unsigned)seed);
  }


  /// Compute random value and update RNG state
  Quaternion<Element> operator()() {

    // Sample from NaN distribution. The branch is gated on the *component* type;
    // the returned Element(NAN) relies on implicit conversion to Quaternion<Element>.
    if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
      if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
        return Element(NAN);
      }
    }

    Element reals[4];

    // Draw the four quaternion components independently.
    for (int i = 0; i < 4; ++i) {
      double rnd = double(std::rand()) / double(RAND_MAX);

      rnd = min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      if (int_scale >= 0) {
        rnd = double(std::llround(rnd * double(1 << int_scale)));
        reals[i] = from_real<Element>(Real(rnd / double(1 << int_scale)));
      }
      else {
        reals[i] = from_real<Element>(Real(rnd));
      }
    }

    return make_Quaternion(reals[0], reals[1], reals[2], reals[3]);
  }
};
822
+
823
+ /// Computes a random uniform distribution
824
+ template <
825
+ typename Element, ///< Element type
826
+ typename Layout> ///< Layout function
827
+ struct TensorFillRandomUniformFunc {
828
+
829
+ using TensorView = TensorView<Element, Layout>;
830
+
831
+ //
832
+ // Data members
833
+ //
834
+
835
+ TensorView view;
836
+ RandomUniformFunc<Element> func;
837
+
838
+ //
839
+ // Methods
840
+ //
841
+
842
+ /// Construction of uniform RNG functor.
843
+ TensorFillRandomUniformFunc(
844
+ TensorView view_ = TensorView(),
845
+ RandomUniformFunc<Element> func_ = RandomUniformFunc<Element>()
846
+ ):
847
+ view(view_), func(func_) {
848
+
849
+ }
850
+
851
+ /// Compute random value and update RNG state
852
+ void operator()(Coord<Layout::kRank> const &coord) {
853
+
854
+ view.at(coord) = func();
855
+ }
856
+ };
857
+
858
+ /// Fills the upper or lower part of a symmetric rank-2 tensor with random values of a uniform distribution.
859
+ template <
860
+ typename Element, ///< Element type
861
+ typename Layout> ///< Layout function
862
+ struct TensorFillSymmetricRandomUniformFunc {
863
+
864
+ using TensorView = TensorView<Element, Layout>;
865
+
866
+ //
867
+ // Data members
868
+ //
869
+
870
+ TensorView view;
871
+ RandomUniformFunc<Element> func;
872
+ cutlass::FillMode fill_mode;
873
+
874
+ //
875
+ // Methods
876
+ //
877
+
878
+ /// Construction of uniform RNG functor.
879
+ TensorFillSymmetricRandomUniformFunc(
880
+ TensorView view_ = TensorView(),
881
+ RandomUniformFunc<Element> func_ = RandomUniformFunc<Element>(),
882
+ cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid
883
+ ):
884
+ view(view_), func(func_), fill_mode(fill_mode_) {
885
+
886
+ }
887
+
888
+ /// Compute random value and update RNG state
889
+ void operator()(Coord<Layout::kRank> const &coord) {
890
+ // Fill half of matrix based on FillMode
891
+ if (Layout::kRank == 2 &&
892
+ fill_mode == cutlass::FillMode::kLower &&
893
+ coord[0] >= coord[1]) {
894
+ view.at(coord) = func();
895
+ } else if (Layout::kRank == 2 &&
896
+ fill_mode == cutlass::FillMode::kUpper &&
897
+ coord[0] <= coord[1]) {
898
+ view.at(coord) = func();
899
+ }
900
+ }
901
+ };
902
+
903
/// Computes a random Uniform distribution and pads diagonal with zeros
template <
  typename Element,                ///< Element type
  typename Layout>                 ///< Layout function
struct TensorFillPadDiagonalRandomUniformFunc {

  using TensorView = TensorView<Element, Layout>;

  //
  // Data members
  //

  TensorView view;                  // destination tensor view
  RandomUniformFunc<Element> func;  // per-element uniform generator
  cutlass::FillMode fill_mode;      // selects lower or upper triangle
  int alignment;                    // width of the band measured from the diagonal

  //
  // Methods
  //

  /// Construction of uniform RNG functor.
  TensorFillPadDiagonalRandomUniformFunc(
    TensorView view_ = TensorView(),
    RandomUniformFunc<Element> func_ = RandomUniformFunc<Element>(),
    cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid,
    int alignment_ = 1
  ):
    view(view_), func(func_), fill_mode(fill_mode_), alignment(alignment_) { }

  /// Compute random value and update RNG state
  void operator()(Coord<Layout::kRank> const &coord) {
    // Fill half of matrix based on FillMode.
    // NOTE(review): '&&' binds tighter than '||', so each trailing
    // '(... >= alignment)' clause is evaluated independently of the rank and
    // fill-mode checks — confirm whether the parenthesization is intended.
    if (Layout::kRank == 2 &&
        (fill_mode == cutlass::FillMode::kLower) &&
        (coord[0] >= coord[1]) ||
        ((coord[1] - coord[0]) >= alignment)) {
      view.at(coord) = func();
    } else if (Layout::kRank == 2 &&
               fill_mode == cutlass::FillMode::kUpper &&
               (coord[0] <= coord[1]) ||
               ((coord[0] - coord[1]) >= alignment)) {
      view.at(coord) = func();
    }
  }
};
951
+
952
+ } // namespace detail
953
+
954
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
955
+
956
+ /// Fills a tensor with random values of a uniform random distribution.
957
+ template <
958
+ typename Element, ///< Element type
959
+ typename Layout> ///< Layout function
960
+ void TensorFillRandomUniform(
961
+ TensorView<Element, Layout> dst, ///< destination tensor
962
+ uint64_t seed, ///< seed for RNG
963
+ double max = 1, ///< upper bound of distribution
964
+ double min = 0, ///< lower bound for distribution
965
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
966
+ /// are not truncated to zero. Permits reducing precision of
967
+ /// data.
968
+ double pnan = 0, ///< Percentage of NaN elements.
969
+ bool exclude_zero = false) { ///< Exclude zero from tensor init
970
+ detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan, exclude_zero);
971
+
972
+ detail::TensorFillRandomUniformFunc<Element, Layout> func(
973
+ dst,
974
+ random_func
975
+ );
976
+
977
+ TensorForEach(
978
+ dst.extent(),
979
+ func
980
+ );
981
+ }
982
+
983
+ /// Fills a tensor with random values of a uniform random distribution.
984
+ template <
985
+ typename Element, ///< Element type
986
+ typename Layout> ///< Layout function
987
+ void TensorFillRandomUniform(
988
+ TensorViewPlanarComplex<Element, Layout> dst, ///< destination tensor
989
+ uint64_t seed, ///< seed for RNG
990
+ double max = 1, ///< upper bound of distribution
991
+ double min = 0, ///< lower bound for distribution
992
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
993
+ /// are not truncated to zero. Permits reducing precision of
994
+ /// data.
995
+ double pnan = 0, ///< Percentage of NaN elements.
996
+ bool exclude_zero = false) { ///< Exclude zero from tensor init
997
+
998
+ TensorFillRandomUniform(dst.view_real(), seed, max, min, bits, pnan, exclude_zero);
999
+ TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits, pnan, exclude_zero);
1000
+ }
1001
+
1002
+
1003
+ /// Fills a tensor with random values with a uniform random distribution.
1004
+ template <
1005
+ typename Element, ///< Element type
1006
+ typename Layout> ///< Layout function
1007
+ void TensorFillRandomUniform(
1008
+ TensorView<Quaternion<Element>, Layout> dst, ///< destination tensor
1009
+ uint64_t seed, ///< seed for RNG
1010
+ double max = 1, ///< upper bound of distribution
1011
+ double min = 0, ///< lower bound for distribution
1012
+ int bits = -1) { ///< If non-negative, specifies number of fractional bits that
1013
+ /// are not truncated to zero. Permits reducing precision of
1014
+ /// data.
1015
+ detail::RandomUniformFunc<Quaternion<Element>> random_func(seed, max, min, bits);
1016
+
1017
+ detail::TensorFillRandomUniformFunc<Quaternion<Element>, Layout> func(
1018
+ dst,
1019
+ random_func
1020
+ );
1021
+
1022
+ TensorForEach(
1023
+ dst.extent(),
1024
+ func
1025
+ );
1026
+ }
1027
+
1028
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1029
+
1030
+ /// Fills a tensor with random values with a uniform random distribution.
1031
+ template <
1032
+ typename Element, ///< Element type
1033
+ typename Layout> ///< Layout function
1034
+ void TensorFillSymmetricRandomUniform(
1035
+ TensorView<Element, Layout> dst, ///< destination tensor
1036
+ uint64_t seed, ///< seed for RNG
1037
+ cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices
1038
+ double max = 1, ///< upper bound of distribution
1039
+ double min = 0, ///< lower bound for distribution
1040
+ int bits = -1) { ///< If non-negative, specifies number of fractional bits that
1041
+ /// are not truncated to zero. Permits reducing precision of
1042
+ /// data.
1043
+
1044
+ detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
1045
+
1046
+ detail::TensorFillSymmetricRandomUniformFunc<Element, Layout> func(
1047
+ dst,
1048
+ random_func,
1049
+ fill_mode
1050
+ );
1051
+
1052
+ TensorForEach(
1053
+ dst.extent(),
1054
+ func
1055
+ );
1056
+ }
1057
+
1058
+ /// Fills a tensor with random values with a uniform random distribution pads zeros along diagonal
1059
+ template <
1060
+ typename Element, ///< Element type
1061
+ typename Layout> ///< Layout function
1062
+ void TensorFillPadDiagonalRandomUniform(
1063
+ TensorView<Element, Layout> dst, ///< destination tensor
1064
+ uint64_t seed, ///< seed for RNG
1065
+ cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices
1066
+ double max = 1, ///< upper bound of distribution
1067
+ double min = 0, ///< lower bound for distribution
1068
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
1069
+ /// are not truncated to zero. Permits reducing precision of
1070
+ /// data.
1071
+ int alignment = 1
1072
+ ) {
1073
+
1074
+ detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
1075
+
1076
+ detail::TensorFillPadDiagonalRandomUniformFunc<Element, Layout> func(
1077
+ dst,
1078
+ random_func,
1079
+ fill_mode,
1080
+ alignment
1081
+ );
1082
+
1083
+ TensorForEach(
1084
+ dst.extent(),
1085
+ func
1086
+ );
1087
+ }
1088
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1089
+
1090
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1091
+
1092
+ /// Fills a tensor with a uniform value
1093
+ template <
1094
+ typename Element ///< Element type
1095
+ >
1096
+ void BlockFill(
1097
+ Element *ptr,
1098
+ size_t capacity,
1099
+ Element val
1100
+ ) {
1101
+ for (size_t i = 0; i < capacity; ++i) {
1102
+ ReferenceFactory<Element>::get(ptr, i) = val;
1103
+ }
1104
+ }
1105
+
1106
+ /// Fills a tensor with random values with a uniform random distribution.
1107
+ template <
1108
+ typename Element ///< Element type
1109
+ >
1110
+ void BlockFillRandomUniform(
1111
+ Element *ptr,
1112
+ size_t capacity,
1113
+ uint64_t seed, ///< seed for RNG
1114
+ double max = 1, ///< upper bound of distribution
1115
+ double min = 0, ///< lower bound for distribution
1116
+ int bits = -1, ///< If non-negative, specifies number of fractional bits that
1117
+ /// are not truncated to zero. Permits reducing precision of
1118
+ /// data.
1119
+ double pnan = 0) { ///< Percentage of NaN elements.
1120
+ detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan);
1121
+
1122
+ for (size_t i = 0; i < capacity; ++i) {
1123
+ ReferenceFactory<Element>::get(ptr, i) = random_func();
1124
+ }
1125
+ }
1126
+
1127
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1128
+
1129
+ namespace detail {
1130
+
1131
template <
  typename Element, ///< Element type
  typename Layout>  ///< Layout function
struct TensorFillDiagonalFunc {

  using TensorView = TensorView<Element, Layout>;

  //
  // Data members
  //

  TensorView view;   ///< tensor being filled
  Element diag;      ///< value written to generalized-diagonal elements
  Element other;     ///< value written to all remaining elements

  //
  // Methods
  //

  /// Constructs the functor with the target view and the two fill values.
  TensorFillDiagonalFunc(
    TensorView const &view_ = TensorView(),
    Element diag_ = Element(1),
    Element other_ = Element(0)
  ):
    view(view_), diag(diag_), other(other_) { }

  /// Visits one coordinate: writes `diag` when the coordinate lies on the
  /// generalized diagonal (all index components equal), otherwise `other`.
  void operator()(Coord<Layout::kRank> const & coord) const {
    bool is_diag = true;

    // On the generalized diagonal iff every component equals the previous one.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < Layout::kRank; ++i) {
      if (coord[i] != coord[i - 1]) {
        is_diag = false;
        break;
      }
    }

    view.at(coord) = (is_diag ? diag : other);
  }
};
1171
+
1172
+ } // namespace detail
1173
+
1174
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1175
+
1176
+ /// Fills a tensor everywhere with a unique value for its diagonal.
1177
+ template <
1178
+ typename Element, ///< Element type
1179
+ typename Layout> ///< Layout function
1180
+ void TensorFillDiagonal(
1181
+ TensorView<Element, Layout> dst, ///< destination tensor
1182
+ Element diag = Element(1), ///< value to write in the diagonal
1183
+ Element other = Element(0)) { ///< value to write off the diagonal
1184
+
1185
+ detail::TensorFillDiagonalFunc<Element, Layout> func(
1186
+ dst,
1187
+ diag,
1188
+ other
1189
+ );
1190
+
1191
+ TensorForEach(
1192
+ dst.extent(),
1193
+ func
1194
+ );
1195
+ }
1196
+
1197
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1198
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1199
+
1200
+ /// Helper to fill a tensor's diagonal with 1 and 0 everywhere else.
1201
+ template <
1202
+ typename Element, ///< Element type
1203
+ typename Layout> ///< Layout function
1204
+ void TensorFillIdentity(
1205
+ TensorView<Element, Layout> dst) { ///< destination tensor
1206
+
1207
+ TensorFillDiagonal(dst, Element(1), Element(0));
1208
+ }
1209
+
1210
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1211
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1212
+
1213
+ /// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements.
1214
+ template <
1215
+ typename Element, ///< Element type
1216
+ typename Layout> ///< Layout function
1217
+ void TensorUpdateDiagonal(
1218
+ TensorView<Element, Layout> dst, ///< destination tensor
1219
+ Element val = Element(1)) {
1220
+
1221
+ typename Layout::Index extent = dst.extent().min();
1222
+
1223
+ for (typename Layout::Index i = 0; i < extent; ++i) {
1224
+ Coord<Layout::kRank> coord(i);
1225
+ dst.at(coord) = val;
1226
+ }
1227
+ }
1228
+
1229
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1230
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1231
+
1232
+ namespace detail {
1233
+
1234
template <
  typename Element, ///< Element type
  typename Layout>  ///< Layout function
struct TensorUpdateOffDiagonalFunc {

  using TensorView = TensorView<Element, Layout>;

  //
  // Data members
  //

  TensorView view;   ///< tensor being updated
  Element other;     ///< value written to off-diagonal elements

  //
  // Methods
  //

  /// Constructs the functor; generalized-diagonal elements are left untouched.
  TensorUpdateOffDiagonalFunc(
    TensorView const &view_ = TensorView(),
    Element other_ = Element(0)
  ):
    view(view_), other(other_) { }

  /// Visits one coordinate: overwrites it with `other` unless it lies on the
  /// generalized diagonal (all index components equal).
  void operator()(Coord<Layout::kRank> const & coord) const {
    bool is_diag = true;

    // On the generalized diagonal iff every component equals the previous one.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 1; i < Layout::kRank; ++i) {
      if (coord[i] != coord[i - 1]) {
        is_diag = false;
        break;
      }
    }

    if (!is_diag) {
      view.at(coord) = other;
    }
  }
};
1274
+
1275
+ } // namespace detail
1276
+
1277
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1278
+
1279
+ /// Writes a uniform value to all elements in the tensor without modifying diagonal elements.
1280
+ template <
1281
+ typename Element, ///< Element type
1282
+ typename Layout> ///< Layout function
1283
+ void TensorUpdateOffDiagonal(
1284
+ TensorView<Element, Layout> dst, ///< destination tensor
1285
+ Element other = Element(1)) {
1286
+
1287
+ detail::TensorUpdateOffDiagonalFunc<Element, Layout> func(
1288
+ dst,
1289
+ other
1290
+ );
1291
+
1292
+ TensorForEach(
1293
+ dst.extent(),
1294
+ func
1295
+ );
1296
+ }
1297
+
1298
+
1299
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1300
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1301
+
1302
+ namespace detail {
1303
+
1304
template <
  typename Element, ///< Element type
  typename Layout>  ///< Layout function
struct TensorFillLinearFunc {

  using TensorView = TensorView<Element, Layout>;

  //
  // Data members
  //

  TensorView view;                   ///< tensor being filled
  Array<Element, Layout::kRank> v;   ///< per-rank coefficients of the linear form
  Element s;                         ///< constant offset added to every element

  //
  // Methods
  //

  TensorFillLinearFunc() { }

  /// Constructs functor
  TensorFillLinearFunc(
    TensorView const &view_,
    Array<Element, Layout::kRank> const & v_,
    Element s_ = Element(0)
  ):
    view(view_), v(v_), s(s_) { }

  /// Updates the tensor: element <- s + dot(coord, v)
  void operator()(Coord<Layout::kRank> const & coord) const {

    Element sum(s);

    // Accumulate the inner product of the coordinate with the coefficients.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Layout::kRank; ++i) {
      sum += Element(coord[i]) * v[i];
    }

    view.at(coord) = sum;
  }
};
1346
+
1347
+ } // namespace detail
1348
+
1349
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1350
+
1351
+ /// Fills tensor with a linear combination of its coordinate and another vector
1352
+ template <
1353
+ typename Element, ///< Element type
1354
+ typename Layout> ///< Layout function
1355
+ void TensorFillLinear(
1356
+ TensorView<Element, Layout> dst, ///< destination tensor
1357
+ Array<Element, Layout::kRank> const & v,
1358
+ Element s = Element(0)) {
1359
+
1360
+ detail::TensorFillLinearFunc<Element, Layout> func(
1361
+ dst,
1362
+ v,
1363
+ s
1364
+ );
1365
+
1366
+ TensorForEach(
1367
+ dst.extent(),
1368
+ func
1369
+ );
1370
+ }
1371
+
1372
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1373
+
1374
+ /// Fills tensor with a linear combination of its coordinate and another vector
1375
+ template <
1376
+ typename Element, ///< Element type
1377
+ typename Layout> ///< Layout function
1378
+ void TensorFillSequential(
1379
+ TensorView<Element, Layout> dst, ///< destination tensor
1380
+ Element s = Element(0)) {
1381
+
1382
+ Array<Element, Layout::kRank> stride;
1383
+
1384
+ stride[0] = Element(1);
1385
+
1386
+ CUTLASS_PRAGMA_UNROLL
1387
+ for (int i = 1; i < Layout::kRank; ++i) {
1388
+ stride[i] = stride[i - 1] * Element(dst.extent()[i - 1]);
1389
+ }
1390
+
1391
+ TensorFillLinear(dst, stride, s);
1392
+ }
1393
+
1394
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1395
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1396
+
1397
+ /// Fills a tensor with random values from a distribution.
1398
+ template <
1399
+ typename Element, ///< Element type
1400
+ typename Layout> ///< Layout function
1401
+ void TensorFillRandom(
1402
+ TensorView<Element, Layout> view, ///< destination tensor
1403
+ uint64_t seed,
1404
+ Distribution dist,
1405
+ bool exclude_zero = false ///< If true, excludes 0.
1406
+ /// Note that setting this flag will result in more 1's,
1407
+ /// as we use a simple mechanism to replace 0's by adding/subtracting 1's.
1408
+ ) {
1409
+
1410
+ using Real = typename RealType<Element>::Type;
1411
+
1412
+ if (dist.kind == Distribution::Gaussian) {
1413
+ TensorFillRandomGaussian(
1414
+ view,
1415
+ seed,
1416
+ dist.gaussian.mean,
1417
+ dist.gaussian.stddev,
1418
+ dist.int_scale,
1419
+ dist.gaussian.pnz,
1420
+ exclude_zero);
1421
+ } else if (dist.kind == Distribution::Uniform) {
1422
+ TensorFillRandomUniform(
1423
+ view,
1424
+ seed,
1425
+ dist.uniform.max,
1426
+ dist.uniform.min,
1427
+ dist.int_scale,
1428
+ dist.uniform.pnan,
1429
+ exclude_zero);
1430
+ }
1431
+ }
1432
+
1433
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1434
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1435
+
1436
+ /// Fills a block of data with sequential elements
1437
+ template <
1438
+ typename Element
1439
+ >
1440
+ void BlockFillSequential(
1441
+ Element *ptr,
1442
+ int64_t capacity,
1443
+ Element v = Element(1),
1444
+ Element s = Element(0)) {
1445
+ int i = 0;
1446
+
1447
+ while (i < capacity) {
1448
+ cutlass::ReferenceFactory<Element, (cutlass::sizeof_bits<Element>::value <
1449
+ 8)>::get(ptr, i) = s;
1450
+
1451
+ s = Element(s + v);
1452
+ ++i;
1453
+ }
1454
+ }
1455
+
1456
+ /// Fills a block of data with sequential elements
1457
+ template <
1458
+ typename Element
1459
+ >
1460
+ void BlockFillSequentialModN(
1461
+ Element *ptr,
1462
+ int64_t capacity,
1463
+ int64_t mod,
1464
+ int64_t v = int64_t(1),
1465
+ int64_t s = int64_t(0)) {
1466
+ int i = 0;
1467
+
1468
+ while (i < capacity) {
1469
+ cutlass::ReferenceFactory<Element, (cutlass::sizeof_bits<Element>::value <
1470
+ 8)>::get(ptr, i) = Element(s);
1471
+
1472
+ s = int64_t(s + v) % mod;
1473
+ ++i;
1474
+ }
1475
+ }
1476
+
1477
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1478
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1479
+
1480
+ /// Fills a block of data with sequential elements
1481
+ template <
1482
+ typename Element
1483
+ >
1484
+ void BlockFillRandom(
1485
+ Element *ptr,
1486
+ size_t capacity,
1487
+ uint64_t seed,
1488
+ Distribution dist) {
1489
+
1490
+ if (dist.kind == Distribution::Gaussian) {
1491
+ BlockFillRandomGaussian<Element>(
1492
+ ptr,
1493
+ capacity,
1494
+ seed,
1495
+ dist.gaussian.mean,
1496
+ dist.gaussian.stddev,
1497
+ dist.int_scale,
1498
+ dist.gaussian.pnz);
1499
+ }
1500
+ else if (dist.kind == Distribution::Uniform) {
1501
+ BlockFillRandomUniform<Element>(
1502
+ ptr,
1503
+ capacity,
1504
+ seed,
1505
+ dist.uniform.max,
1506
+ dist.uniform.min,
1507
+ dist.int_scale,
1508
+ dist.uniform.pnan);
1509
+ }
1510
+ }
1511
+
1512
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1513
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1514
+
1515
+ namespace detail {
1516
+
1517
template <typename Element>
struct RandomSparseMetaFunc {

  uint64_t seed;       ///< seed forwarded to std::srand
  int range;           ///< number of valid metadata nibbles to sample from
  int MetaSizeInBits;  ///< metadata width per group: 2 or 4 bits

  //
  // Methods
  //

  /// Seeds the C RNG and selects the valid metadata alphabet.
  /// Throws std::invalid_argument for widths other than 2 or 4.
  RandomSparseMetaFunc(
    uint64_t seed_ = 0,
    int MetaSizeInBits_ = 2
  ):
    seed(seed_), MetaSizeInBits(MetaSizeInBits_) {
    std::srand((unsigned)seed);
    if (MetaSizeInBits_ == 2) {
      range = 6;
    }
    else if (MetaSizeInBits_ == 4) {
      range = 2;
    }
    else {
      throw std::invalid_argument("Invalid MetaSizeInBits");
    }
  }

  /// Compute random value and update RNG state
  Element operator()() const {
    // Valid 4-bit metadata nibbles for the two sparsity modes — presumably
    // encoding which elements of each group survive pruning (confirm against
    // the sparse-GEMM metadata format documentation).
    Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe};
    Element TwoToOneMeta[2] = {0x4, 0xe};

    Element * MetaArray = (MetaSizeInBits == 2) ? FourToTwoMeta : TwoToOneMeta;

    Element result = 0x0;

    // Pack one randomly chosen nibble into each 4-bit slot of the result word.
    for (int i = 0; i < cutlass::sizeof_bits<Element>::value / 4; ++i) {
      int rnd = std::rand() % range;
      Element meta = MetaArray[rnd];

      result = (Element)(result | ((Element)(meta << (i * 4))));
    }

    return result;
  }
};
1564
+
1565
/// Computes a random sparse meta
template <
  typename Element, ///< Element type
  typename Layout>  ///< Layout function
struct TensorFillRandomSparseMetaFunc {

  using TensorView = TensorView<Element, Layout>;

  //
  // Data members
  //

  TensorView view;                     ///< destination tensor
  RandomSparseMetaFunc<Element> func;  ///< generator of random metadata words

  //
  // Methods
  //

  /// Constructs the sparse-metadata fill functor.
  /// (Previous comment incorrectly referred to a "Gaussian RNG functor".)
  TensorFillRandomSparseMetaFunc(
    TensorView view_ = TensorView(),
    RandomSparseMetaFunc<Element> func_ = RandomSparseMetaFunc<Element>()
  ):
    view(view_), func(func_) {

  }

  /// Compute random value and update RNG state
  void operator()(Coord<Layout::kRank> const &coord) const {

    view.at(coord) = func();
  }
};
1599
+
1600
+ } // namespace detail
1601
+
1602
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1603
+
1604
+ /// Fills a tensor with random values with a uniform random distribution.
1605
+ template <
1606
+ typename Element, ///< Element type
1607
+ typename Layout> ///< Layout function
1608
+ void TensorFillRandomSparseMeta(
1609
+ TensorView<Element, Layout> dst, ///< destination tensor
1610
+ uint64_t seed, ///< seed for RNG
1611
+ int MetaSizeInBits) { ///< 2 bit or 4 bit
1612
+
1613
+ detail::RandomSparseMetaFunc<Element> random_func(seed, MetaSizeInBits);
1614
+
1615
+ detail::TensorFillRandomSparseMetaFunc<Element, Layout> func(
1616
+ dst,
1617
+ random_func
1618
+ );
1619
+
1620
+ TensorForEach(
1621
+ dst.extent(),
1622
+ func
1623
+ );
1624
+ }
1625
+
1626
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1627
+
1628
+ /// Fills a tensor with random values with a uniform random distribution.
1629
+ template <
1630
+ typename Element ///< Element type
1631
+ >
1632
+ void BlockFillRandomSparseMeta(
1633
+ Element *ptr,
1634
+ size_t capacity,
1635
+ uint64_t seed, ///< seed for RNG
1636
+ int MetaSizeInBits) { ///< 2 bit or 4bit
1637
+
1638
+ detail::RandomSparseMetaFunc<Element> random_func(seed, MetaSizeInBits);
1639
+
1640
+ for (size_t i = 0; i < capacity; ++i) {
1641
+ ptr[i] = random_func();
1642
+ }
1643
+ }
1644
+
1645
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1646
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1647
+
1648
/// Fills a ell block index matrix with random values with a uniform random distribution.
template <
  typename Element,               ///< Element type
  typename Layout>                ///< Layout function
void TensorFillRandomEllIdx(
  TensorView<Element, Layout> dst,        ///< destination tensor
  uint64_t seed,                          ///< seed for RNG
  int rows, int ell_cols, int cols) {     ///< dimension of the matrix

  std::srand((unsigned)seed);

  // Each row receives a strictly increasing sequence of random column
  // indices; once the last column is reached, the remaining slots are
  // written as -1 (which appears to act as a "no block" sentinel for the
  // ELL format — confirm against the consuming kernel).
  for (int i = 0; i < rows; ++i) {
    int col_idx = std::rand() % cols;

    for (int j = 0; j < ell_cols; ++j) {
      dst.at({i, j}) = col_idx;

      if (col_idx != -1) {
        if (col_idx == (cols - 1)) {
          // Columns exhausted: subsequent entries of this row become -1.
          col_idx = -1;
        } else {
          // Advance to a uniformly chosen later column, keeping the
          // sequence strictly increasing.
          col_idx = std::rand() % (cols - col_idx - 1) + col_idx + 1;
        }
      }
    }
  }
}
1675
+
1676
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1677
+
1678
+ /// Copies a diagonal in from host memory without modifying off-diagonal elements.
1679
+ template <
1680
+ typename Element, ///< Element type
1681
+ typename Layout> ///< Layout function
1682
+ void TensorCopyDiagonalIn(
1683
+ TensorView<Element, Layout> dst, ///< destination tensor
1684
+ Element const *ptr) { ///< dense buffer of elements
1685
+
1686
+ typename Layout::Index extent = dst.extent().min();
1687
+
1688
+ for (typename Layout::Index i = 0; i < extent; ++i) {
1689
+ Coord<Layout::kRank> coord(i);
1690
+ dst.at(coord) = ReferenceFactory<Element>::get(ptr, i);
1691
+ }
1692
+ }
1693
+
1694
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1695
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1696
+
1697
+ /// Copies the diagonal of a tensor into a dense buffer in host memory.
1698
+ template <
1699
+ typename Element, ///< Element type
1700
+ typename Layout> ///< Layout function
1701
+ void TensorCopyDiagonalOut(
1702
+ Element *ptr, ///< dense buffer of elements
1703
+ TensorView<Element, Layout> src) { ///< source tensor
1704
+
1705
+ typename Layout::Index extent = src.extent().min();
1706
+
1707
+ for (typename Layout::Index i = 0; i < extent; ++i) {
1708
+ Coord<Layout::kRank> coord(i);
1709
+ ReferenceFactory<Element>::get(ptr, i) = src.at(coord);
1710
+ }
1711
+ }
1712
+
1713
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1714
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
1715
+
1716
+ } // namespace host
1717
+ } // namespace reference
1718
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_fill.hpp ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Provides several functions for filling tensors with data.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ // Standard Library includes
38
+ #include <utility>
39
+ #include <cstdlib>
40
+ #include <cmath>
41
+
42
+ // Cute includes
43
+ #include "cute/tensor.hpp"
44
+
45
+ // Cutlass includes
46
+ #include "cutlass/cutlass.h"
47
+ #include "cutlass/complex.h"
48
+ #include "cutlass/quaternion.h"
49
+ #include "cutlass/array.h"
50
+ #include "cutlass/numeric_types.h"
51
+
52
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ namespace cutlass {
55
+ namespace reference {
56
+ namespace host {
57
+
58
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
59
+ //
60
+ // Uniform and procedural tensor fills
61
+ //
62
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
63
+
64
+ /// Fills a tensor with a scalar element
65
+ template <typename Tensor>
66
+ void TensorFill(Tensor dst, typename Tensor::value_type element) {
67
+
68
+ for (int64_t idx = 0; idx < cute::size(dst); ++idx) {
69
+ dst(idx) = element;
70
+ }
71
+ }
72
+
73
+ /// Fills a tensor with the contents of its layout
74
+ template <typename Tensor>
75
+ void TensorFillSequential(Tensor dst) {
76
+
77
+ auto layout = dst.layout();
78
+
79
+ for (int64_t idx = 0; idx < cute::size(dst); ++idx) {
80
+ dst(idx) = layout(idx);
81
+ }
82
+ }
83
+
84
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
85
+ //
86
+ // Random uniform values
87
+ //
88
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
89
+
90
+ namespace detail {
91
+
92
+ template <typename Element>
93
+ struct RandomUniformFunc {
94
+
95
+ using Real = typename RealType<Element>::Type;
96
+
97
+ uint64_t seed;
98
+ double range;
99
+ double min;
100
+ int int_scale;
101
+
102
+ //
103
+ // Methods
104
+ //
105
+
106
+ RandomUniformFunc(
107
+ uint64_t seed_ = 0,
108
+ double max = 1,
109
+ double min_ = 0,
110
+ int int_scale_ = -1
111
+ ):
112
+ seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
113
+ std::srand((unsigned)seed);
114
+ }
115
+
116
+
117
+ /// Compute random value and update RNG state
118
+ Element operator()() const {
119
+
120
+ double rnd = double(std::rand()) / double(RAND_MAX);
121
+
122
+ rnd = min + range * rnd;
123
+
124
+ // Random values are cast to integer after scaling by a power of two to facilitate error
125
+ // testing
126
+ Element result;
127
+
128
+ if (int_scale >= 0) {
129
+ rnd = double(int64_t(rnd * double(1 << int_scale))) / double(1 << int_scale);
130
+ result = static_cast<Element>(Real(rnd));
131
+ }
132
+ else {
133
+ result = static_cast<Element>(Real(rnd));
134
+ }
135
+
136
+ return result;
137
+ }
138
+ };
139
+
140
/// Partial specialization for initializing a complex value.
template <typename Element>
struct RandomUniformFunc<complex<Element> > {

  using Real = typename RealType<Element>::Type;

  uint64_t seed;    ///< seed forwarded to std::srand
  double range;     ///< width of the distribution (max - min)
  double min;       ///< lower bound of the distribution
  int int_scale;    ///< if non-negative, quantize to this many fractional bits

  //
  // Methods
  //

  /// Seeds the C RNG and records the distribution parameters.
  RandomUniformFunc(
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
    std::srand((unsigned)seed);
  }


  /// Compute random value and update RNG state
  complex<Element> operator()() const {

    // Real and imaginary parts are drawn independently.
    Element reals[2];

    for (int i = 0; i < 2; ++i) {
      double rnd = double(std::rand()) / double(RAND_MAX);

      rnd = min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing

      if (int_scale >= 0) {
        rnd = double(int(rnd * double(1 << int_scale)));
        reals[i] = from_real<Element>(Real(rnd / double(1 << int_scale)));
      }
      else {
        reals[i] = from_real<Element>(Real(rnd));
      }
    }

    return complex<Element>(reals[0], reals[1]);
  }
};
191
+
192
/// Partial specialization for initializing a Quaternion value.
template <typename Element>
struct RandomUniformFunc<Quaternion<Element> > {

  using Real = typename RealType<Element>::Type;

  uint64_t seed;    ///< seed forwarded to std::srand
  double range;     ///< width of the distribution (max - min)
  double min;       ///< lower bound of the distribution
  int int_scale;    ///< if non-negative, quantize to this many fractional bits

  //
  // Methods
  //

  /// Seeds the C RNG and records the distribution parameters.
  RandomUniformFunc(
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
    std::srand((unsigned)seed);
  }


  /// Compute random value and update RNG state
  Quaternion<Element> operator()() const {

    // All four quaternion components are drawn independently.
    Element reals[4];

    for (int i = 0; i < 4; ++i) {
      double rnd = double(std::rand()) / double(RAND_MAX);

      rnd = min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing

      if (int_scale >= 0) {
        rnd = double(int(rnd * double(1 << int_scale)));
        reals[i] = from_real<Element>(Real(rnd / double(1 << int_scale)));
      }
      else {
        reals[i] = from_real<Element>(Real(rnd));
      }
    }

    return make_Quaternion(reals[0], reals[1], reals[2], reals[3]);
  }
};
243
+
244
+ } // namespace detail
245
+
246
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
247
+
248
+ /// Fills a tensor with random values with a uniform random distribution.
249
+ template <typename Tensor> ///< Tensor object
250
+ void TensorFillRandomUniform(
251
+ Tensor dst, ///< destination tensor
252
+ uint64_t seed, ///< seed for RNG
253
+ double max = 1, ///< upper bound of distribution
254
+ double min = 0, ///< lower bound for distribution
255
+ int bits = -1) { ///< If non-negative, specifies number of fractional bits that
256
+ /// are not truncated to zero. Permits reducing precision of
257
+ /// data.
258
+
259
+ detail::RandomUniformFunc<typename Tensor::value_type> random_func(seed, max, min, bits);
260
+
261
+ for (int64_t idx = 0; idx < cute::size(dst); ++idx) {
262
+ dst(idx) = random_func();
263
+ }
264
+ }
265
+
266
+ /// Fills a block with random values with a uniform random distribution.
267
+ template <
268
+ typename Element ///< Element type
269
+ >
270
+ void BlockFillRandomUniform(
271
+ Element *ptr,
272
+ size_t capacity,
273
+ uint64_t seed, ///< seed for RNG
274
+ double max = 1, ///< upper bound of distribution
275
+ double min = 0, ///< lower bound for distribution
276
+ int bits = -1) { ///< If non-negative, specifies number of fractional bits that
277
+ /// are not truncated to zero. Permits reducing precision of
278
+ /// data.
279
+ detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
280
+
281
+ for (size_t i = 0; i < capacity; ++i) {
282
+ ptr[i] = random_func();
283
+ }
284
+ }
285
+
286
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
287
+ //
288
+ // Random Gaussian
289
+ //
290
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
291
+
292
+ namespace detail {
293
+
294
template <typename Element>
struct RandomGaussianFunc {

  uint64_t seed;    ///< seed forwarded to std::srand
  double mean;      ///< mean of the Gaussian distribution
  double stddev;    ///< standard deviation of the Gaussian distribution
  int int_scale;    ///< if non-negative, quantize to this many fractional bits
  double pi;        ///< cached value of pi for the Box-Muller transform

  //
  // Methods
  //

  /// Seeds the C RNG and records the distribution parameters.
  RandomGaussianFunc(
    uint64_t seed_ = 0,
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)) {
    std::srand((unsigned)seed);
  }

  /// Compute random value and update RNG state
  Element operator()() const {

    // Box-Muller transform: two independent uniform samples produce one
    // normally distributed sample.
    double const u1 = double(std::rand()) / double(RAND_MAX);
    double const u2 = double(std::rand()) / double(RAND_MAX);

    double sample = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2);
    sample = mean + stddev * sample;

    // Optionally quantize to 'int_scale' fractional bits before conversion.
    if (int_scale >= 0) {
      sample = double(int64_t(sample * double(1 << int_scale))) / double(1 << int_scale);
    }

    return static_cast<Element>(sample);
  }
};
341
+
342
+ } // namespace detail
343
+
344
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
345
+
346
+ /// Fills a tensor with random values with a Gaussian distribution.
347
+ template <
348
+ typename Tensor
349
+ >
350
+ void TensorFillRandomGaussian(
351
+ Tensor dst, ///< destination tensor
352
+ uint64_t seed, ///< seed for RNG
353
+ double mean = 0, ///< Gaussian distribution's mean
354
+ double stddev = 1, ///< Gaussian distribution's standard deviation
355
+ int bits = -1) { ///< If non-negative, specifies number of fractional bits that
356
+ /// are not truncated to zero. Permits reducing precision of
357
+ /// data.
358
+
359
+ detail::RandomGaussianFunc<typename Tensor::value_type> random_func(seed, mean, stddev, bits);
360
+
361
+ for (int64_t idx = 0; idx < cute::size(dst); ++idx) {
362
+ dst(idx) = random_func();
363
+ }
364
+ }
365
+
366
+ /// Fills a block with random values with a Gaussian distribution.
367
+ template <
368
+ typename Element ///< Element type
369
+ >
370
+ void BlockFillRandomGaussian(
371
+ Element *ptr, ///< destination buffer
372
+ size_t capacity, ///< number of elements
373
+ uint64_t seed, ///< seed for RNG
374
+ double mean = 0, ///< Gaussian distribution's mean
375
+ double stddev = 1, ///< Gaussian distribution's standard deviation
376
+ int bits = -1) { ///< If non-negative, specifies number of fractional bits that
377
+ /// are not truncated to zero. Permits reducing precision of
378
+ /// data.
379
+
380
+ detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits);
381
+
382
+ for (size_t i = 0; i < capacity; ++i) {
383
+ ptr[i] = random_func();
384
+ }
385
+ }
386
+
387
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
388
+
389
/// Fills a block of data with sequential elements: ptr[i] = s + i * v.
///
/// Fixes two defects: the previous body wrote the constant `s + v` into every
/// element (never advancing `s`, contradicting the "sequential" contract and
/// the tensor_fill.h implementation), and used a 32-bit loop counter against
/// the 64-bit `capacity` (overflow/UB for very large blocks).
template <
  typename Element
>
void BlockFillSequential(
  Element *ptr,
  int64_t capacity,
  Element v = Element(1),                 ///< increment between consecutive elements
  Element s = Element(0)) {               ///< value of the first element
  int64_t i = 0;

  while (i < capacity) {
    ptr[i] = s;
    s = Element(s + v);
    ++i;
  }
}
406
+
407
/// Fills a block of data with sequential elements reduced modulo `mod`:
/// values follow s, (s+v) % mod, (s+2v) % mod, ...
///
/// Fixes two defects: the previous body wrote the constant `(s + v) % mod`
/// into every element (never advancing `s`, contradicting the "sequential"
/// contract and the tensor_fill.h implementation), and used a 32-bit loop
/// counter against the 64-bit `capacity` (overflow/UB for very large blocks).
template <
  typename Element
>
void BlockFillSequentialModN(
  Element *ptr,
  int64_t capacity,
  int64_t mod,                            ///< modulus applied to the running value
  int64_t v = int64_t(1),                 ///< increment between consecutive elements
  int64_t s = int64_t(0)) {               ///< starting value
  int64_t i = 0;

  while (i < capacity) {
    ptr[i] = static_cast<Element>(int32_t(s));
    s = int64_t(s + v) % mod;
    ++i;
  }
}
425
+
426
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
427
+
428
+ } // namespace host
429
+ } // namespace reference
430
+ } // namespace cutlass
431
+
432
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_foreach.h ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include <stdexcept>
34
+ #include "cutlass/cutlass.h"
35
+
36
+ namespace cutlass {
37
+ namespace reference {
38
+ namespace host {
39
+
40
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+ /// Defines several helpers
43
+ namespace detail {
44
+
45
+ /// Helper to perform for-each operation
46
+ template <typename Func, int Rank, int RankRemaining>
47
+ struct TensorForEachHelper {
48
+
49
+ /// Index of the active rank
50
+ static int const kActiveRank = Rank - RankRemaining - 1;
51
+
52
+ /// Constructor for general rank
53
+ TensorForEachHelper(
54
+ Func &func,
55
+ Coord<Rank> const &extent,
56
+ Coord<Rank> &coord) {
57
+
58
+ for (int i = 0; i < extent.at(kActiveRank); ++i) {
59
+ coord[kActiveRank] = i;
60
+ TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, extent, coord);
61
+ }
62
+ }
63
+ };
64
+
65
+ /// Helper to perform for-each operation
66
+ template <typename Func, int Rank>
67
+ struct TensorForEachHelper<Func, Rank, 0> {
68
+
69
+ /// Index of the active rank
70
+ static int const kActiveRank = Rank - 1;
71
+
72
+ /// Constructor for fastest changing rank
73
+ TensorForEachHelper(
74
+ Func &func,
75
+ Coord<Rank> const &extent,
76
+ Coord<Rank> &coord) {
77
+
78
+ for (int i = 0; i < extent.at(kActiveRank); ++i) {
79
+ coord[kActiveRank] = i;
80
+ func(coord);
81
+ }
82
+ }
83
+ };
84
+
85
+ } // namespace detail
86
+
87
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
88
+
89
+ /// Iterates over the index space of a tensor
90
+ template <
91
+ typename Func, ///< function applied to each point in a tensor's index space
92
+ int Rank> ///< rank of index space
93
+ void TensorForEach(Coord<Rank> extent, Func & func) {
94
+ Coord<Rank> coord;
95
+ detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, extent, coord);
96
+ }
97
+
98
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
99
+
100
+ /// Iterates over the index space of a tensor and calls a C++ lambda
101
+ template <
102
+ typename Func, ///< function applied to each point in a tensor's index space
103
+ int Rank> ///< rank of index space
104
+ void TensorForEachLambda(Coord<Rank> extent, Func func) {
105
+ Coord<Rank> coord;
106
+ detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, extent, coord);
107
+ }
108
+
109
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
110
+
111
/// Initializes each element of a linear block of memory with successive values
/// produced by a nullary functor: ptr[i] = func() for i in [0, capacity).
template <typename Element, typename Func>
struct BlockForEach {

  /// Constructor performs the operation; `params` configures the functor.
  BlockForEach(
      Element *ptr,                                               ///< start of the block
      size_t capacity,                                            ///< number of elements
      typename Func::Params params = typename Func::Params()) {   ///< functor configuration

    Func func(params);

    size_t index = 0;
    while (index < capacity) {
      ptr[index] = func();
      ++index;
    }
  }
};
127
+
128
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
129
+
130
+ } // namespace host
131
+ } // namespace reference
132
+ } // namespace cutlass
133
+
134
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_norm.h ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+
34
+ #include "cutlass/cutlass.h"
35
+
36
+ // The contents of this file have been moved to 'tensor_reduce' to cover other types of reductions.
37
+
38
+ #include "cutlass/util/reference/host/tensor_reduce.h"
39
+
40
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_reduce.h ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
#include <cmath>
#include <stdexcept>

#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/tensor_ref.h"

#include "cutlass/util/reference/detail/linear_to_coordinate.h"
#include "cutlass/core_io.h"
41
+
42
+ namespace cutlass {
43
+ namespace reference {
44
+ namespace host {
45
+
46
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
47
+
48
+ /// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
49
+ /// workspace
50
+ template <
51
+ typename Element,
52
+ typename Layout,
53
+ typename ComputeType,
54
+ typename ReduceOp,
55
+ typename TransformOp
56
+ >
57
+ ComputeType TensorTransformReduce(
58
+ TensorView<Element, Layout> view,
59
+ ComputeType identity,
60
+ ReduceOp reduce,
61
+ TransformOp transform
62
+ ) {
63
+
64
+ for (int64_t idx = 0; idx < int64_t(view.size()); ++idx) {
65
+ typename Layout::TensorCoord coord;
66
+ cutlass::reference::detail::LinearToCoordinate<Layout::kRank>()(coord, idx, view.extent());
67
+
68
+ if (view.contains(coord)) {
69
+ Element x = view.at(coord);
70
+ identity = reduce(identity, transform(x));
71
+ }
72
+ }
73
+
74
+ return identity;
75
+ }
76
+
77
+ /// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
78
+ /// workspace
79
+ template <
80
+ typename Element,
81
+ typename Layout,
82
+ typename ComputeType,
83
+ typename ReduceOp,
84
+ typename TransformOp
85
+ >
86
+ ComputeType TensorTransformReduce(
87
+ TensorView<Element, Layout> view_A,
88
+ TensorView<Element, Layout> view_B,
89
+ ComputeType identity,
90
+ ReduceOp reduce,
91
+ TransformOp transform) {
92
+
93
+ if (view_A.extent() != view_B.extent()) {
94
+ throw std::runtime_error("Tensor extents must match.");
95
+ }
96
+
97
+ for (int64_t idx = 0; idx < int64_t(view_A.size()); ++idx) {
98
+
99
+ typename Layout::TensorCoord coord;
100
+ cutlass::reference::detail::LinearToCoordinate<Layout::kRank>()(coord, idx, view_A.extent());
101
+
102
+ if (view_A.contains(coord)) {
103
+ Element a = view_A.at(coord);
104
+ Element b = view_B.at(coord);
105
+ identity = reduce(identity, transform(a, b));
106
+ }
107
+ }
108
+
109
+ return identity;
110
+ }
111
+
112
+ /// Helper to compute the sum of the elements of a tensor
113
+ template <
114
+ typename Element,
115
+ typename Layout,
116
+ typename ComputeType = Element
117
+ >
118
+ ComputeType TensorSum(
119
+ TensorView<Element, Layout> view,
120
+ ComputeType identity = ComputeType()
121
+ ) {
122
+
123
+ plus<ComputeType> reduce;
124
+ NumericConverter<ComputeType, Element> transform;
125
+
126
+ return TensorTransformReduce(
127
+ view, identity, reduce, transform);
128
+ }
129
+
130
+ /// Helper to compute the sum of the squares of the elements of a tensor
131
+ template <
132
+ typename Element,
133
+ typename Layout,
134
+ typename ComputeType = Element
135
+ >
136
+ ComputeType TensorSumSq(
137
+ TensorView<Element, Layout> view,
138
+ ComputeType identity = ComputeType()
139
+ ) {
140
+
141
+ plus<ComputeType> reduce;
142
+ magnitude_squared<Element, ComputeType> transform;
143
+
144
+ return TensorTransformReduce(
145
+ view, identity, reduce, transform);
146
+ }
147
+
148
+ /// Helper to compute the norm of the elements of a tensor.
149
+ template <
150
+ typename Element,
151
+ typename Layout,
152
+ typename ComputeType = double
153
+ >
154
+ ComputeType TensorNorm(
155
+ TensorView<Element, Layout> view,
156
+ ComputeType identity = ComputeType()
157
+ ) {
158
+
159
+ return std::sqrt(TensorSumSq(view, identity));
160
+ }
161
+
162
+ /// Helper to compute the sum of the squares of the differences of two tensors
163
+ template <
164
+ typename Element,
165
+ typename Layout,
166
+ typename ComputeType = double
167
+ >
168
+ ComputeType TensorSumSqDiff(
169
+ TensorView<Element, Layout> view_A,
170
+ TensorView<Element, Layout> view_B,
171
+ ComputeType identity = ComputeType()
172
+ ) {
173
+
174
+ plus<ComputeType> reduce;
175
+ magnitude_squared_difference<Element, ComputeType> transform;
176
+
177
+ return TensorTransformReduce(
178
+ view_A, view_B, identity, reduce, transform);
179
+ }
180
+
181
+
182
+ /// Helper to compute the norm of the tensor computed as the difference of two tensors in memory
183
+ template <
184
+ typename Element,
185
+ typename Layout,
186
+ typename ComputeType = double
187
+ >
188
+ ComputeType TensorNormDiff(
189
+ TensorView<Element, Layout> view_A,
190
+ TensorView<Element, Layout> view_B,
191
+ ComputeType identity = ComputeType()
192
+ ) {
193
+
194
+ return std::sqrt(TensorSumSqDiff(view_A, view_B, identity));
195
+ }
196
+
197
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
198
+
199
+ } // namespace host
200
+ } // namespace reference
201
+ } // namespace cutlass
202
+
203
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Provides several functions for filling tensors with data.
33
+ */
34
+
35
+ #pragma once
36
+
37
// Standard Library includes
#include <cmath>
#include <cstdlib>
#include <stdexcept>
#include <utility>

// Cute includes
#include "cute/tensor.hpp"

// Cutlass includes
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/complex.h"
#include "cutlass/functional.h"
#include "cutlass/numeric_conversion.h"
#include "cutlass/numeric_types.h"
#include "cutlass/quaternion.h"
53
+
54
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ namespace cutlass {
57
+ namespace reference {
58
+ namespace host {
59
+
60
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
61
+ //
62
+ // Tensor reductions
63
+ //
64
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
65
+
66
+ /// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
67
+ /// workspace
68
+ template <
69
+ typename Tensor,
70
+ typename ComputeType,
71
+ typename ReduceOp,
72
+ typename TransformOp
73
+ >
74
+ ComputeType TensorTransformReduce(
75
+ Tensor view,
76
+ ComputeType identity,
77
+ ReduceOp reduce,
78
+ TransformOp transform
79
+ ) {
80
+
81
+ for (int64_t idx = 0; idx < cute::size(view); ++idx) {
82
+ identity = reduce(identity, transform(view(idx)));
83
+ }
84
+
85
+ return identity;
86
+ }
87
+
88
+ /// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
89
+ /// workspace
90
+ template <
91
+ typename TensorA,
92
+ typename TensorB,
93
+ typename ComputeType,
94
+ typename ReduceOp,
95
+ typename TransformOp
96
+ >
97
+ ComputeType TensorTransformReduce(
98
+ TensorA view_A,
99
+ TensorB view_B,
100
+ ComputeType identity,
101
+ ReduceOp reduce,
102
+ TransformOp transform) {
103
+
104
+ if (cute::size(view_A) != cute::size(view_B)) {
105
+ throw std::runtime_error("Tensor sizes must match.");
106
+ }
107
+
108
+ for (int64_t idx = 0; idx < cute::size(view_A); ++idx) {
109
+ identity = reduce(identity, transform(view_A(idx), view_B(idx)));
110
+ }
111
+
112
+ return identity;
113
+ }
114
+
115
+ /// Helper to compute the sum of the elements of a tensor
116
+ template <
117
+ typename Tensor,
118
+ typename ComputeType = typename Tensor::value_type
119
+ >
120
+ ComputeType TensorSum(
121
+ Tensor view,
122
+ ComputeType identity = ComputeType()
123
+ ) {
124
+
125
+ plus<ComputeType> reduce;
126
+ NumericConverter<ComputeType, typename Tensor::value_type> transform;
127
+
128
+ return TensorTransformReduce(
129
+ view, identity, reduce, transform);
130
+ }
131
+
132
+ /// Helper to compute the sum of the squares of the elements of a tensor
133
+ template <
134
+ typename Tensor,
135
+ typename ComputeType = typename Tensor::value_type
136
+ >
137
+ ComputeType TensorSumSq(
138
+ Tensor view,
139
+ ComputeType identity = ComputeType()
140
+ ) {
141
+
142
+ plus<ComputeType> reduce;
143
+ magnitude_squared<typename Tensor::value_type, ComputeType> transform;
144
+
145
+ return TensorTransformReduce(
146
+ view, identity, reduce, transform);
147
+ }
148
+
149
+ /// Helper to compute the norm of the elements of a tensor.
150
/// Computes the Frobenius norm of a cute tensor: sqrt(identity + sum |x|^2).
template <
  typename Tensor,
  typename ComputeType = double
>
ComputeType TensorNorm(
  Tensor view,
  ComputeType identity = ComputeType()
) {
  // Seed the squared-magnitude reduction with `identity`, then take the root.
  return std::sqrt(TensorSumSq(view, identity));
}
161
+
162
+ /// Helper to compute the sum of the squares of the differences of two tensors
163
+ template <
164
+ typename TensorA,
165
+ typename TensorB,
166
+ typename ComputeType = double
167
+ >
168
+ ComputeType TensorSumSqDiff(
169
+ TensorA view_A,
170
+ TensorB view_B,
171
+ ComputeType identity = ComputeType()
172
+ ) {
173
+
174
+ plus<ComputeType> reduce;
175
+ magnitude_squared_difference<typename TensorA::value_type, ComputeType> transform;
176
+
177
+ return TensorTransformReduce(
178
+ view_A, view_B, identity, reduce, transform);
179
+ }
180
+
181
+
182
+ /// Helper to compute the norm of the tensor computed as the difference of two tensors in memory
183
/// Computes the Frobenius norm of the elementwise difference of two cute
/// tensors: sqrt(identity + sum |a - b|^2).
template <
  typename TensorA,
  typename TensorB,
  typename ComputeType = double
>
ComputeType TensorNormDiff(
  TensorA view_A,
  TensorB view_B,
  ComputeType identity = ComputeType()
) {
  return std::sqrt(TensorSumSqDiff(view_A, view_B, identity));
}
196
+
197
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
198
+
199
+ } // namespace host
200
+ } // namespace reference
201
+ } // namespace cutlass
202
+
203
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/trmm.h ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for TRMM in host-side code.
33
+
34
+
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/blas3.h"
40
+ #include "cutlass/numeric_conversion.h"
41
+ #include "cutlass/tensor_view.h"
42
+ #include "cutlass/gemm/gemm.h"
43
+ #include "cutlass/arch/mma.h"
44
+ #include "cutlass/util/host_tensor.h"
45
+
46
+ #include "cutlass/util/reference/host/gemm.h"
47
+
48
+ namespace cutlass {
49
+ namespace reference {
50
+ namespace host {
51
+
52
/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef
/// objects.
///
/// For SideMode::kLeft the product accumulated is A * B (A triangular); for
/// SideMode::kRight it is B * A. The value written is
///   tensor_d(row, col) = ConvertOp(alpha * accum(row, col))
/// where every accumulator starts at `initial_accum`. Elements of A outside
/// the triangle selected by FillModeA contribute zero, and diagonal elements
/// are forced to one when DiagTypeA == DiagType::kUnit.
template <
  typename ElementA,
  typename LayoutA,
  SideMode SideModeA,
  FillMode FillModeA,
  DiagType DiagTypeA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ScalarType,
  typename ComputeType,
  typename InnerProductOp = multiply_add<ComputeType>,
  typename ConvertOp = NumericConverter<ElementC, ScalarType>
>
void compute_trmm(
  gemm::GemmCoord problem_size,           ///< problem dimensions (m, n, k); batch is ignored
  ScalarType alpha,                       ///< scale applied to the accumulated product
  TensorRef<ElementA, LayoutA> tensor_a,  ///< triangular operand A
  TensorRef<ElementB, LayoutB> tensor_b,  ///< dense operand B
  TensorRef<ElementC, LayoutC> tensor_d,  ///< destination tensor (in-bounds elements overwritten)
  ComputeType initial_accum) {            ///< starting value for every accumulator

  static_assert(
    LayoutA::kRank == 2 &&
    LayoutC::kRank == 2, "Tensors must be of rank 2");

  static_assert(SideModeA != SideMode::kInvalid
    , "Side Mode can either be Left or Right.");

  static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper
    , "Fill Mode can either be Lower or Upper.");

  // Predicate deciding whether an index pair lies inside the stored triangle
  // of A for the given fill mode / diagonal type.
  using CompareOp = typename TrMatrixCompareOp<FillModeA, DiagTypeA>::Type;

  // Note: batch is ignored.
  int const M = problem_size.m();
  int const N = problem_size.n();
  // Assuming correct k-dimension value is passed
  int const K = problem_size.k();

  // Blocking necessary to speedup reference implementation
  int const Mblock = 16;
  int const Nblock = 16;

  ConvertOp convert_op;
  InnerProductOp inner_product_op;
  CompareOp compare_op;

  // Process the output in Mblock x Nblock tiles so the tile of accumulators
  // fits in a small stack array.
  for (int row_block = 0; row_block < M; row_block += Mblock) {
    for (int col_block = 0; col_block < N; col_block += Nblock) {

      ComputeType accum[Mblock][Nblock];

      // Initialize every accumulator of the tile.
      for (int j = 0; j < Nblock; j++) {
        for (int i = 0; i < Mblock; i++) {
          accum[i][j] = initial_accum;
        }
      }

      // Accumulate the inner products over the K dimension.
      for (int k_block = 0; k_block < K; ++k_block) {
        for (int j = 0; j < Nblock; j++) {
          for (int i = 0; i < Mblock; i++) {
            int row = row_block + i;
            int col = col_block + j;

            if (row < M && col < N) {
              ElementA a = ElementA();
              ElementB b = ElementB();

              if (SideModeA == SideMode::kLeft) {
                // Left side: 'a' comes from triangular A (zero outside the
                // triangle, one on a unit diagonal), 'b' from dense B.
                a = (compare_op(row, k_block)) ?
                  (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0);
                if (row == k_block && DiagTypeA == DiagType::kUnit) {
                  a = ElementA(1);
                }
                b = tensor_b.at(MatrixCoord(k_block, col));
              } else if (SideModeA == SideMode::kRight) {
                // Right side: roles swap — 'a' holds the dense B element and
                // 'b' the triangular A element.
                // NOTE(review): 'b' is declared ElementB but is assigned
                // ElementA values below; this assumes ElementA and ElementB
                // are interchangeable — confirm for mixed-type instantiations.
                a = tensor_b.at(MatrixCoord(row, k_block));
                b = (compare_op(k_block, col)) ?
                  tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0);
                if (k_block == col && DiagTypeA == DiagType::kUnit) {
                  b = ElementA(1);
                }
              }

              ComputeType compute_a(cast_if_scalar<ComputeType>(a));
              ComputeType compute_b(cast_if_scalar<ComputeType>(b));

              accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]);
            }
          }
        }
      }

      // Scale, convert, and write back only the in-bounds part of the tile.
      for (int j = 0; j < Nblock; j++) {
        for (int i = 0; i < Mblock; i++) {
          int row = row_block + i;
          int col = col_block + j;

          MatrixCoord coord = MatrixCoord(row, col);

          if (row < M && col < N) {
            tensor_d.at(coord) = convert_op(
              alpha * ScalarType(accum[i][j]));
          }
        }
      }
    }
  }
}
165
+
166
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
167
+
168
/// Host-side reference functor for TRMM. The primary template is
/// declaration-only; partial specializations selected by the inner-product
/// operator tag (e.g. arch::OpMultiplyAdd) define operator().
template <
  typename ElementA,
  typename LayoutA,
  SideMode SideModeA,
  FillMode FillModeA,
  DiagType DiagTypeA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ScalarType,
  typename ComputeType,
  typename InnerProductOp = cutlass::arch::OpMultiplyAdd
>
struct Trmm;
183
+
184
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
185
+
186
/// Partial specialization for multiply-add
template <typename ElementA, typename LayoutA, SideMode SideModeA,
          FillMode FillModeA, DiagType DiagTypeA,
          typename ElementB, typename LayoutB,
          typename ElementC, typename LayoutC,
          typename ScalarType, typename ComputeType>
struct Trmm<ElementA, LayoutA, SideModeA, FillModeA, DiagTypeA, ElementB, LayoutB,
            ElementC, LayoutC, ScalarType,
            ComputeType, arch::OpMultiplyAdd> {

  /// Runs the host-reference TRMM, writing alpha-scaled results to tensor_d.
  /// Accumulation is performed in ComputeType, starting from initial_accum.
  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
                  TensorRef<ElementA, LayoutA> tensor_a,
                  TensorRef<ElementB, LayoutB> tensor_b,
                  TensorRef<ElementC, LayoutC> tensor_d,
                  ComputeType initial_accum = ComputeType(0)) {
    static_assert(
      LayoutA::kRank == 2 && LayoutC::kRank == 2,
      "Tensors must be of rank 2");

    // Forward to the free-function reference with the plain multiply-add
    // inner-product operator.
    compute_trmm<ElementA, LayoutA, SideModeA, FillModeA, DiagTypeA, ElementB, LayoutB,
                 ElementC, LayoutC, ScalarType, ComputeType, multiply_add<ComputeType>>(
      problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum);
  }
};
210
+
211
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
212
+
213
+ } // namespace host
214
+ } // namespace reference
215
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/trmm_complex.h ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Reference implementation for complex-valued TRMM in host-side code.
33
+
34
+
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/blas3.h"
40
+ #include "cutlass/complex.h"
41
+ #include "cutlass/numeric_conversion.h"
42
+ #include "cutlass/tensor_view.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+
45
+ #include "cutlass/util/reference/host/gemm.h"
46
+
47
+ namespace cutlass {
48
+ namespace reference {
49
+ namespace host {
50
+
51
+ /// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef
52
+ /// objects.
53
+ template <
54
+ typename ElementA,
55
+ typename LayoutA,
56
+ ComplexTransform TransformA,
57
+ SideMode SideModeA,
58
+ FillMode FillModeA,
59
+ DiagType DiagTypeA,
60
+ typename ElementB,
61
+ typename LayoutB,
62
+ ComplexTransform TransformB,
63
+ typename ElementC,
64
+ typename LayoutC,
65
+ typename ScalarType,
66
+ typename ComputeType,
67
+ typename InnerProductOp = multiply_add<ComputeType>,
68
+ typename ConvertOp = NumericConverter<ElementC, ScalarType>
69
+ >
70
+ void compute_trmm_complex(
71
+ gemm::GemmCoord problem_size,
72
+ ScalarType alpha,
73
+ TensorRef<ElementA, LayoutA> tensor_a,
74
+ TensorRef<ElementB, LayoutB> tensor_b,
75
+ TensorRef<ElementC, LayoutC> tensor_d,
76
+ ComputeType initial_accum) {
77
+
78
+ static_assert(
79
+ LayoutA::kRank == 2 &&
80
+ LayoutC::kRank == 2, "Tensors must be of rank 2");
81
+
82
+ static_assert(SideModeA != SideMode::kInvalid
83
+ , "Side Mode can either be Left or Right.");
84
+
85
+ static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper
86
+ , "Fill Mode can either be Lower or Upper.");
87
+
88
+ using CompareOp = typename TrMatrixCompareOp<FillModeA, DiagTypeA>::Type;
89
+
90
+ // Note: batch is ignored.
91
+ int const M = problem_size.m();
92
+ int const N = problem_size.n();
93
+ // Assuming correct k-dimension value is passed
94
+ int const K = problem_size.k();
95
+
96
+ // Blocking necessary to speedup reference implementation
97
+ int const Mblock = 16;
98
+ int const Nblock = 16;
99
+
100
+ ConvertOp convert_op;
101
+ InnerProductOp inner_product_op;
102
+ CompareOp compare_op;
103
+
104
+ for (int row_block = 0; row_block < M; row_block += Mblock) {
105
+ for (int col_block = 0; col_block < N; col_block += Nblock) {
106
+
107
+ ComputeType accum[Mblock][Nblock];
108
+
109
+ for (int j = 0; j < Nblock; j++) {
110
+ for (int i = 0; i < Mblock; i++) {
111
+ accum[i][j] = initial_accum;
112
+ }
113
+ }
114
+
115
+ for (int k_block = 0; k_block < K; ++k_block) {
116
+ for (int j = 0; j < Nblock; j++) {
117
+ for (int i = 0; i < Mblock; i++) {
118
+ int row = row_block + i;
119
+ int col = col_block + j;
120
+
121
+ if (row < M && col < N) {
122
+ ElementA a = ElementA();
123
+ ElementB b = ElementB();
124
+
125
+ if (SideModeA == SideMode::kLeft) {
126
+ a = (compare_op(row, k_block)) ?
127
+ (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0);
128
+ if (row == k_block && DiagTypeA == DiagType::kUnit) {
129
+ a = ElementA(1);
130
+ }
131
+ b = tensor_b.at(MatrixCoord(k_block, col));
132
+ } else if (SideModeA == SideMode::kRight) {
133
+ a = tensor_b.at(MatrixCoord(row, k_block));
134
+ b = (compare_op(k_block, col)) ?
135
+ tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0);
136
+ if (k_block == col && DiagTypeA == DiagType::kUnit) {
137
+ b = ElementA(1);
138
+ }
139
+ }
140
+
141
+ ComputeType a_ik = ComputeType(a);
142
+ ComputeType b_kj = ComputeType(b);
143
+
144
+ // Conjugate, and hence hermitian, is only allowed for the triangular matrix
145
+ if (SideModeA == SideMode::kLeft && TransformA == ComplexTransform::kConjugate) {
146
+ a_ik = conj(a_ik);
147
+ } else if (SideModeA == SideMode::kRight && TransformA == ComplexTransform::kConjugate) {
148
+ b_kj = conj(b_kj);
149
+ }
150
+
151
+ accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]);
152
+ }
153
+ }
154
+ }
155
+ }
156
+
157
+ for (int j = 0; j < Nblock; j++) {
158
+ for (int i = 0; i < Mblock; i++) {
159
+ int row = row_block + i;
160
+ int col = col_block + j;
161
+
162
+ MatrixCoord coord = MatrixCoord(row, col);
163
+
164
+ if (row < M && col < N) {
165
+ tensor_d.at(coord) = convert_op(
166
+ alpha * ScalarType(accum[i][j]));
167
+ }
168
+ }
169
+ }
170
+ }
171
+ }
172
+ }
173
+
174
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
175
+
176
/// Primary template for the host-side complex TRMM reference operator. Intentionally
/// left undefined: only the partial specializations below (for the complex and
/// Gaussian-complex multiply-add op tags) provide an implementation.
template <
  typename ElementA,        ///< element type of the triangular operand A
  typename LayoutA,         ///< layout of A (must be rank-2)
  ComplexTransform TransformA,  ///< optional conjugation applied to A
  SideMode SideModeA,       ///< whether A multiplies B from the left or the right
  FillMode FillModeA,       ///< which triangle of A is stored (lower/upper)
  DiagType DiagTypeA,       ///< unit or non-unit diagonal of A
  typename ElementB,        ///< element type of the dense operand B
  typename LayoutB,         ///< layout of B
  ComplexTransform TransformB,  ///< transform tag for B (not applied by compute_trmm_complex)
  typename ElementC,        ///< element type of the destination D
  typename LayoutC,         ///< layout of D (must be rank-2)
  typename ScalarType,      ///< type of the alpha scalar
  typename ComputeType,     ///< accumulator type used internally
  typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex  ///< math-op tag selecting a specialization
>
struct TrmmComplex;
193
+
194
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
195
+
196
/// Partial specialization for multiply-add
template <typename ElementA, typename LayoutA, ComplexTransform TransformA,
          SideMode SideModeA, FillMode FillModeA, DiagType DiagTypeA,
          typename ElementB, typename LayoutB, ComplexTransform TransformB,
          typename ElementC, typename LayoutC,
          typename ScalarType, typename ComputeType>
struct TrmmComplex<ElementA, LayoutA, TransformA,
                   SideModeA, FillModeA, DiagTypeA,
                   ElementB, LayoutB, TransformB,
                   ElementC, LayoutC, ScalarType,
                   ComputeType, arch::OpMultiplyAddComplex> {

  /// Runs the complex TRMM reference computation by delegating to compute_trmm_complex
  /// with a plain multiply_add<ComputeType> inner product.
  ///
  /// \param problem_size  GEMM-style extents {m, n, k}
  /// \param alpha         scale applied to the accumulated product
  /// \param tensor_a      triangular operand A
  /// \param tensor_b      dense operand B
  /// \param tensor_d      destination tensor D (written by compute_trmm_complex)
  /// \param initial_accum starting value of each accumulator (defaults to zero)
  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
                  TensorRef<ElementA, LayoutA> tensor_a,
                  TensorRef<ElementB, LayoutB> tensor_b,
                  TensorRef<ElementC, LayoutC> tensor_d,
                  ComputeType initial_accum = ComputeType(0)) {
    static_assert(
      LayoutA::kRank == 2 && LayoutC::kRank == 2,
      "Tensors must be of rank 2");

    compute_trmm_complex<ElementA, LayoutA, TransformA,
                         SideModeA, FillModeA, DiagTypeA,
                         ElementB, LayoutB, TransformB,
                         ElementC, LayoutC,
                         ScalarType, ComputeType, multiply_add<ComputeType>>(
      problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum);
  }
};
225
+
226
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
227
+
228
/// Partial specialization for gaussian multiply-add
template <typename ElementA, typename LayoutA, ComplexTransform TransformA,
          SideMode SideModeA, FillMode FillModeA, DiagType DiagTypeA,
          typename ElementB, typename LayoutB, ComplexTransform TransformB,
          typename ElementC, typename LayoutC,
          typename ScalarType, typename ComputeType>
struct TrmmComplex<ElementA, LayoutA, TransformA,
                   SideModeA, FillModeA, DiagTypeA,
                   ElementB, LayoutB, TransformB,
                   ElementC, LayoutC, ScalarType,
                   ComputeType, arch::OpMultiplyAddGaussianComplex> {

  /// Runs the complex TRMM reference computation for the Gaussian-complex op tag.
  ///
  /// NOTE(review): this host reference delegates with a plain multiply_add inner
  /// product, identical to the OpMultiplyAddComplex specialization — presumably the
  /// Gaussian 3-multiply formulation only affects device kernels while the reference
  /// result is the same product; confirm against the device-side implementation.
  ///
  /// \param problem_size  GEMM-style extents {m, n, k}
  /// \param alpha         scale applied to the accumulated product
  /// \param tensor_a      triangular operand A
  /// \param tensor_b      dense operand B
  /// \param tensor_d      destination tensor D (written by compute_trmm_complex)
  /// \param initial_accum starting value of each accumulator (defaults to zero)
  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
                  TensorRef<ElementA, LayoutA> tensor_a,
                  TensorRef<ElementB, LayoutB> tensor_b,
                  TensorRef<ElementC, LayoutC> tensor_d,
                  ComputeType initial_accum = ComputeType(0)) {
    static_assert(
      LayoutA::kRank == 2 && LayoutC::kRank == 2,
      "Tensors must be of rank 2");

    compute_trmm_complex<ElementA, LayoutA, TransformA,
                         SideModeA, FillModeA, DiagTypeA,
                         ElementB, LayoutB, TransformB,
                         ElementC, LayoutC,
                         ScalarType, ComputeType, multiply_add<ComputeType>>(
      problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum);
  }
};
257
+
258
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
259
+
260
+ } // namespace host
261
+ } // namespace reference
262
+ } // namespace cutlass