Spaces:
Running
Running
Jeroen Mostert committed on
Commit ·
1d65fea
1
Parent(s): 6519fd2
Allow all RDNA2 archs to use sdot4 intrinsic (llama/8629)
Browse files
The check gating the use of `__builtin_amdgcn_sdot4` specifically checks for gfx1030. This causes a severe perf regression for anything gfx103? that's not gfx1030 and not using `HSA_OVERRIDE_GFX_VERSION` (if you've built ROCm to support it). We already have a generic RDNA2 define, let's use it.
ggml/src/ggml-cuda/common.cuh
CHANGED
|
@@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
|
| 459 |
|
| 460 |
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
| 461 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 462 |
-
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
| 463 |
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
| 464 |
#elif defined(RDNA3)
|
| 465 |
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
|
|
|
| 459 |
|
| 460 |
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
| 461 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 462 |
+
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
| 463 |
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
| 464 |
#elif defined(RDNA3)
|
| 465 |
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|