diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.26s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
Wed Oct 29 00:36:23 2025 +Wed Oct 29 04:13:37 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | |-----------------------------------------+------------------------+----------------------+ @@ -3896,7 +3896,7 @@ Cell: nv | 0.23s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 31C P0 86W / 350W | 0MiB / 46068MiB | 22% Default | +| N/A 30C P0 116W / 350W | 0MiB / 46068MiB | 67% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3920,7 +3920,7 @@ Cell: nv | 0.23s ▼ output ▶ uv-logs | -Cell: benchmark | 4.48s +Cell: benchmark | 8.00s | Raw @@ -3989,23 +3989,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 417.663us 1766.17% 417.663us 417.663us 1 - hf_kernels_rotary 11.92% 243.797us 99.67% 2.039ms 2.039ms 0.000us 0.00% 24.864us 24.864us 1 - _rotary_dba7d1e::apply_rotary 2.64% 54.054us 5.06% 103.576us 17.263us 16.992us 71.85% 16.992us 2.832us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 71.85% 16.992us 2.832us 6 - aten::clone 2.02% 41.272us 79.82% 1.633ms 272.116us 0.000us 0.00% 7.872us 1.312us 6 - aten::copy_ 1.82% 37.200us 74.94% 1.533ms 255.467us 6.656us 28.15% 7.872us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 28.15% 6.656us 1.109us 6 - Activity Buffer Request 69.47% 1.421ms 69.47% 1.421ms 1.421ms 1.216us 5.14% 1.216us 1.216us 1 - aten::empty_strided 2.87% 58.622us 2.87% 58.622us 9.770us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.65% 74.674us 3.65% 74.674us 12.446us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.25% 46.121us 2.87% 58.631us 4.886us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.61% 12.510us 0.61% 12.510us 1.042us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.42% 49.522us 2.42% 49.522us 8.254us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.33% 6.691us 0.33% 6.691us 6.691us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 430.557us 1823.31% 430.557us 430.557us 1 + hf_kernels_rotary 12.45% 261.077us 99.68% 2.091ms 2.091ms 0.000us 0.00% 24.830us 24.830us 1 + _rotary_dba7d1e::apply_rotary 2.74% 57.471us 5.07% 106.292us 17.715us 16.960us 71.82% 16.960us 2.827us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 71.82% 16.960us 2.827us 6 + aten::clone 2.05% 43.019us 79.24% 1.662ms 276.993us 0.000us 0.00% 7.870us 1.312us 6 + aten::copy_ 2.02% 42.402us 74.32% 1.559ms 259.814us 6.654us 28.18% 7.870us 1.312us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.654us 28.18% 6.654us 1.109us 6 + Activity Buffer Request 68.70% 1.441ms 68.70% 1.441ms 1.441ms 1.216us 5.15% 1.216us 1.216us 1 + aten::empty_strided 2.86% 60.050us 2.86% 60.050us 10.008us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 3.60% 75.463us 3.60% 75.463us 12.577us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.30% 48.161us 2.93% 61.552us 5.129us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.64% 13.391us 0.64% 13.391us 1.116us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.33% 48.821us 2.33% 48.821us 8.137us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.32% 6.611us 0.32% 6.611us 6.611us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.045ms -Self CUDA time total: 23.648us +Self CPU time total: 2.097ms +Self CUDA time total: 23.614us @@ -4015,23 +4015,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 362.684us 1535.76% 362.684us 362.684us 1 - hf_kernels_rotary 9.63% 184.044us 99.76% 1.906ms 1.906ms 0.000us 0.00% 24.736us 24.736us 1 - _rotary_dba7d1e::apply_rotary 2.64% 50.383us 5.03% 96.065us 16.011us 16.864us 71.41% 16.864us 2.811us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.864us 71.41% 16.864us 2.811us 6 - aten::clone 1.50% 28.618us 82.74% 1.581ms 263.486us 0.000us 0.00% 7.872us 1.312us 6 - aten::copy_ 1.95% 37.192us 79.54% 1.520ms 253.297us 6.752us 28.59% 7.872us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 28.59% 6.752us 1.125us 6 - Activity Buffer Request 74.55% 1.424ms 74.55% 1.424ms 1.424ms 1.120us 4.74% 1.120us 1.120us 1 - aten::empty_strided 1.70% 32.513us 1.70% 32.513us 5.419us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 3.05% 58.263us 3.05% 58.263us 9.710us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.86% 35.461us 2.36% 45.051us 3.754us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.50% 9.590us 0.50% 9.590us 0.799us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.39% 45.682us 2.39% 45.682us 7.614us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.600us 0.24% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 354.397us 1494.59% 354.397us 354.397us 1 + hf_kernels_rotary 8.83% 167.284us 99.70% 1.889ms 1.889ms 0.000us 0.00% 24.832us 24.832us 1 + _rotary_dba7d1e::apply_rotary 2.30% 43.483us 4.65% 88.083us 14.681us 16.928us 71.39% 16.928us 2.821us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 71.39% 16.928us 2.821us 6 + aten::clone 1.15% 21.821us 83.90% 1.589ms 264.887us 0.000us 0.00% 7.904us 1.317us 6 + aten::copy_ 1.92% 36.410us 80.96% 1.534ms 255.610us 6.784us 28.61% 7.904us 1.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 28.61% 6.784us 1.131us 6 + Activity Buffer Request 76.05% 1.441ms 76.05% 1.441ms 1.441ms 1.120us 4.72% 1.120us 1.120us 1 + aten::empty_strided 1.79% 33.840us 1.79% 33.840us 5.640us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.99% 56.551us 2.99% 56.551us 9.425us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.80% 34.161us 2.32% 43.981us 3.665us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.52% 9.820us 0.52% 9.820us 0.818us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.35% 44.600us 2.35% 44.600us 7.433us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.30% 5.690us 0.30% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.911ms -Self CUDA time total: 23.616us +Self CPU time total: 1.894ms +Self CUDA time total: 23.712us @@ -4041,23 +4041,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.889us 1406.66% 352.889us 352.889us 1 - hf_kernels_rotary 9.52% 180.074us 99.73% 1.887ms 1.887ms 0.000us 0.00% 26.399us 26.399us 1 - _rotary_dba7d1e::apply_rotary 2.26% 42.841us 4.55% 86.004us 14.334us 17.248us 68.75% 17.248us 2.875us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.248us 68.75% 17.248us 2.875us 6 - aten::clone 1.50% 28.330us 83.30% 1.576ms 262.706us 0.000us 0.00% 9.151us 1.525us 6 - aten::copy_ 1.91% 36.070us 80.06% 1.515ms 252.487us 7.839us 31.25% 9.151us 1.525us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.839us 31.25% 7.839us 1.307us 6 - Activity Buffer Request 75.19% 1.423ms 75.19% 1.423ms 1.423ms 1.312us 5.23% 1.312us 1.312us 1 - aten::empty_strided 1.74% 32.981us 1.74% 32.981us 5.497us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.97% 56.174us 2.97% 56.174us 9.362us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.86% 35.224us 2.36% 44.742us 3.729us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.50% 9.518us 0.50% 9.518us 0.793us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.28% 43.163us 2.28% 43.163us 7.194us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.27% 5.081us 0.27% 5.081us 5.081us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.514us 1403.08% 351.514us 351.514us 1 + hf_kernels_rotary 8.97% 168.905us 99.73% 1.878ms 1.878ms 0.000us 0.00% 26.397us 26.397us 1 + _rotary_dba7d1e::apply_rotary 2.31% 43.581us 4.63% 87.301us 14.550us 17.182us 68.58% 17.182us 2.864us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.182us 68.58% 17.182us 2.864us 6 + aten::clone 1.22% 22.970us 83.89% 1.580ms 263.366us 0.000us 0.00% 9.215us 1.536us 6 + aten::copy_ 1.97% 37.139us 80.89% 1.524ms 253.959us 7.871us 31.42% 9.215us 1.536us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.871us 31.42% 7.871us 1.312us 6 + Activity Buffer Request 76.03% 1.432ms 76.03% 1.432ms 1.432ms 1.344us 5.36% 1.344us 1.344us 1 + aten::empty_strided 1.78% 33.471us 1.78% 33.471us 5.579us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.90% 54.532us 2.90% 54.532us 9.089us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.75% 33.032us 2.23% 42.062us 3.505us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.48% 9.030us 0.48% 9.030us 0.753us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.32% 43.720us 2.32% 43.720us 7.287us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.180us 0.27% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.892ms -Self CUDA time total: 25.087us +Self CPU time total: 1.884ms +Self CUDA time total: 25.053us @@ -4067,22 +4067,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.892us 1375.46% 353.892us 353.892us 1 - hf_kernels_rotary 8.61% 178.135us 99.77% 2.063ms 2.063ms 0.000us 0.00% 27.041us 27.041us 1 - _rotary_dba7d1e::apply_rotary 2.02% 41.741us 4.14% 85.532us 14.255us 17.985us 69.90% 17.985us 2.997us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.985us 69.90% 17.985us 2.997us 6 - aten::clone 1.32% 27.361us 84.83% 1.754ms 292.410us 0.000us 0.00% 9.056us 1.509us 6 - aten::copy_ 1.77% 36.582us 81.87% 1.693ms 282.183us 7.744us 30.10% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 30.10% 7.744us 1.291us 6 - Activity Buffer Request 68.36% 1.414ms 68.36% 1.414ms 1.414ms 1.312us 5.10% 1.312us 1.312us 1 - aten::empty_strided 1.64% 34.001us 1.64% 34.001us 5.667us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.73% 242.678us 11.73% 242.678us 40.446us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.70% 35.202us 2.18% 45.153us 3.763us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.48% 9.951us 0.48% 9.951us 0.829us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.12% 43.791us 2.12% 43.791us 7.299us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.830us 0.23% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.719us 1363.13% 350.719us 350.719us 1 + hf_kernels_rotary 8.14% 166.612us 99.74% 2.042ms 2.042ms 0.000us 0.00% 27.041us 27.041us 1 + _rotary_dba7d1e::apply_rotary 2.16% 44.301us 4.32% 88.521us 14.753us 18.017us 70.03% 18.017us 3.003us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.017us 70.03% 18.017us 3.003us 6 + aten::clone 1.09% 22.263us 85.15% 1.743ms 290.575us 0.000us 0.00% 9.024us 1.504us 6 + aten::copy_ 1.77% 36.339us 82.49% 1.689ms 281.498us 7.712us 29.97% 9.024us 1.504us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 29.97% 7.712us 1.285us 6 + Activity Buffer Request 69.05% 1.414ms 69.05% 1.414ms 1.414ms 1.312us 5.10% 1.312us 1.312us 1 + aten::empty_strided 1.57% 32.200us 1.57% 32.200us 5.367us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 11.67% 238.856us 11.67% 238.856us 39.809us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.67% 34.224us 2.13% 43.713us 3.643us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 9.489us 0.46% 9.489us 0.791us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.16% 44.220us 2.16% 44.220us 7.370us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.270us 0.26% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.068ms +Self CPU time total: 2.048ms Self CUDA time total: 25.729us @@ -4093,23 +4093,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 351.422us 1397.30% 351.422us 351.422us 1 - hf_kernels_rotary 8.84% 180.886us 99.76% 2.041ms 2.041ms 0.000us 0.00% 26.462us 26.462us 1 - _rotary_dba7d1e::apply_rotary 2.10% 42.971us 4.17% 85.245us 14.208us 17.214us 68.45% 17.214us 2.869us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.214us 68.45% 17.214us 2.869us 6 - aten::clone 1.43% 29.360us 84.55% 1.730ms 288.328us 0.000us 0.00% 9.248us 1.541us 6 - aten::copy_ 1.75% 35.821us 81.51% 1.668ms 277.955us 7.936us 31.55% 9.248us 1.541us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 31.55% 7.936us 1.323us 6 - Activity Buffer Request 69.89% 1.430ms 69.89% 1.430ms 1.430ms 1.312us 5.22% 1.312us 1.312us 1 - aten::empty_strided 1.61% 32.881us 1.61% 32.881us 5.480us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.87% 201.958us 9.87% 201.958us 33.660us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.76% 36.050us 2.20% 45.010us 3.751us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.44% 8.960us 0.44% 8.960us 0.747us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.07% 42.274us 2.07% 42.274us 7.046us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.920us 0.24% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.430us 1391.53% 350.430us 350.430us 1 + hf_kernels_rotary 8.29% 166.207us 99.75% 1.999ms 1.999ms 0.000us 0.00% 26.527us 26.527us 1 + _rotary_dba7d1e::apply_rotary 2.23% 44.751us 4.45% 89.251us 14.875us 17.247us 68.49% 17.247us 2.874us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.247us 68.49% 17.247us 2.874us 6 + aten::clone 1.13% 22.610us 84.88% 1.701ms 283.499us 0.000us 0.00% 9.280us 1.547us 6 + aten::copy_ 1.92% 38.531us 82.21% 1.647ms 274.571us 7.936us 31.51% 9.280us 1.547us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 31.51% 7.936us 1.323us 6 + Activity Buffer Request 70.76% 1.418ms 70.76% 1.418ms 1.418ms 1.344us 5.34% 1.344us 1.344us 1 + aten::empty_strided 1.54% 30.960us 1.54% 30.960us 5.160us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.53% 190.904us 9.53% 190.904us 31.817us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.66% 33.300us 2.12% 42.550us 3.546us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 9.250us 0.46% 9.250us 0.771us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.22% 44.500us 2.22% 44.500us 7.417us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.25% 4.920us 0.25% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.046ms -Self CUDA time total: 25.150us +Self CPU time total: 2.004ms +Self CUDA time total: 25.183us @@ -4119,23 +4119,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 347.166us 1351.10% 347.166us 347.166us 1 - hf_kernels_rotary 21.36% 176.235us 99.42% 820.279us 820.279us 0.000us 0.00% 27.039us 27.039us 1 - _rotary_dba7d1e::apply_rotary 5.20% 42.901us 10.31% 85.044us 14.174us 17.951us 69.86% 17.951us 2.992us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 17.951us 69.86% 17.951us 2.992us 6 - aten::clone 2.62% 21.601us 62.49% 515.608us 85.935us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 4.36% 35.950us 55.96% 461.697us 76.950us 7.744us 30.14% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 30.14% 7.744us 1.291us 6 - Activity Buffer Request 27.88% 230.028us 27.88% 230.028us 230.028us 1.344us 5.23% 1.344us 1.344us 1 - aten::empty_strided 3.92% 32.310us 3.92% 32.310us 5.385us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.72% 195.719us 23.72% 195.719us 32.620us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.18% 34.481us 5.26% 43.392us 3.616us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.08% 8.911us 1.08% 8.911us 0.743us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.11% 42.143us 5.11% 42.143us 7.024us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.821us 0.58% 4.821us 4.821us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.057us 1363.40% 352.057us 352.057us 1 + hf_kernels_rotary 8.30% 165.062us 99.73% 1.982ms 1.982ms 0.000us 0.00% 27.166us 27.166us 1 + _rotary_dba7d1e::apply_rotary 2.21% 43.981us 4.47% 88.794us 14.799us 18.046us 69.89% 18.046us 3.008us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 18.046us 69.89% 18.046us 3.008us 6 + aten::clone 1.14% 22.690us 84.76% 1.685ms 280.783us 0.000us 0.00% 9.120us 1.520us 6 + aten::copy_ 1.83% 36.352us 82.00% 1.630ms 271.644us 7.776us 30.11% 9.120us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 30.11% 7.776us 1.296us 6 + Activity Buffer Request 70.82% 1.408ms 70.82% 1.408ms 1.408ms 1.344us 5.20% 1.344us 1.344us 1 + aten::empty_strided 1.62% 32.140us 1.62% 32.140us 5.357us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.35% 185.845us 9.35% 185.845us 30.974us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.71% 33.929us 2.19% 43.590us 3.632us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.49% 9.661us 0.49% 9.661us 0.805us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.25% 44.813us 2.25% 44.813us 7.469us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.451us 0.27% 5.451us 5.451us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 825.100us -Self CUDA time total: 25.695us +Self CPU time total: 1.988ms +Self CUDA time total: 25.822us @@ -4145,23 +4145,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.595us 1078.61% 348.595us 348.595us 1 - hf_kernels_rotary 21.56% 162.014us 99.35% 746.516us 746.516us 0.000us 0.00% 34.111us 34.111us 1 - _rotary_dba7d1e::apply_rotary 5.56% 41.814us 11.41% 85.705us 14.284us 21.792us 67.43% 21.792us 3.632us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.792us 67.43% 21.792us 3.632us 6 - aten::clone 2.84% 21.362us 60.59% 455.236us 75.873us 0.000us 0.00% 12.319us 2.053us 6 - aten::copy_ 5.05% 37.942us 53.37% 401.033us 66.839us 10.527us 32.57% 12.319us 2.053us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.527us 32.57% 10.527us 1.755us 6 - Activity Buffer Request 22.09% 165.945us 22.09% 165.945us 165.945us 1.792us 5.54% 1.792us 1.792us 1 - aten::empty_strided 4.37% 32.841us 4.37% 32.841us 5.474us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 26.24% 197.146us 26.24% 197.146us 32.858us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.61% 34.610us 5.80% 43.561us 3.630us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.19% 8.951us 1.19% 8.951us 0.746us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.84% 43.891us 5.84% 43.891us 7.315us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.65% 4.870us 0.65% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 358.939us 1106.16% 358.939us 358.939us 1 + hf_kernels_rotary 8.22% 166.055us 99.77% 2.015ms 2.015ms 0.000us 0.00% 34.209us 34.209us 1 + _rotary_dba7d1e::apply_rotary 2.37% 47.870us 4.45% 89.851us 14.975us 21.952us 67.65% 21.952us 3.659us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.952us 67.65% 21.952us 3.659us 6 + aten::clone 1.14% 23.001us 84.91% 1.715ms 285.805us 0.000us 0.00% 12.257us 2.043us 6 + aten::copy_ 1.81% 36.630us 82.10% 1.658ms 276.333us 10.497us 32.35% 12.257us 2.043us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.497us 32.35% 10.497us 1.750us 6 + Activity Buffer Request 71.15% 1.437ms 71.15% 1.437ms 1.437ms 1.760us 5.42% 1.760us 1.760us 1 + aten::empty_strided 1.68% 33.831us 1.68% 33.831us 5.639us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.14% 184.505us 9.14% 184.505us 30.751us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.72% 34.708us 2.18% 44.050us 3.671us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.46% 9.342us 0.46% 9.342us 0.778us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.08% 41.981us 2.08% 41.981us 6.997us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.681us 0.23% 4.681us 4.681us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 751.386us -Self CUDA time total: 32.319us +Self CPU time total: 2.019ms +Self CUDA time total: 32.449us @@ -4171,23 +4171,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.018us 687.35% 353.018us 353.018us 1 - hf_kernels_rotary 20.18% 167.279us 99.43% 824.358us 824.358us 0.000us 0.00% 54.175us 54.175us 1 - _rotary_dba7d1e::apply_rotary 5.18% 42.971us 10.43% 86.461us 14.410us 34.432us 67.04% 34.432us 5.739us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.432us 67.04% 34.432us 5.739us 6 - aten::clone 2.72% 22.563us 63.67% 527.908us 87.985us 0.000us 0.00% 19.743us 3.290us 6 - aten::copy_ 4.40% 36.441us 57.12% 473.605us 78.934us 16.927us 32.96% 19.743us 3.290us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.96% 16.927us 2.821us 6 - Activity Buffer Request 29.36% 243.449us 29.36% 243.449us 243.449us 2.816us 5.48% 2.816us 2.816us 1 - aten::empty_strided 3.83% 31.740us 3.83% 31.740us 5.290us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.37% 193.715us 23.37% 193.715us 32.286us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.09% 33.928us 5.15% 42.710us 3.559us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.06% 8.782us 1.06% 8.782us 0.732us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.25% 43.490us 5.25% 43.490us 7.248us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.720us 0.57% 4.720us 4.720us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 358.237us 692.31% 358.237us 358.237us 1 + hf_kernels_rotary 8.27% 167.723us 99.73% 2.023ms 2.023ms 0.000us 0.00% 54.593us 54.593us 1 + _rotary_dba7d1e::apply_rotary 2.25% 45.682us 4.44% 90.052us 15.009us 34.785us 67.22% 34.785us 5.798us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.785us 67.22% 34.785us 5.798us 6 + aten::clone 1.16% 23.462us 84.91% 1.722ms 287.005us 0.000us 0.00% 19.808us 3.301us 6 + aten::copy_ 1.80% 36.481us 82.11% 1.665ms 277.534us 16.960us 32.78% 19.808us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 32.78% 16.960us 2.827us 6 + Activity Buffer Request 71.22% 1.444ms 71.22% 1.444ms 1.444ms 2.848us 5.50% 2.848us 2.848us 1 + aten::empty_strided 1.65% 33.360us 1.65% 33.360us 5.560us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.09% 184.354us 9.09% 184.354us 30.726us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.63% 33.070us 2.11% 42.771us 3.564us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.48% 9.701us 0.48% 9.701us 0.808us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.19% 44.370us 2.19% 44.370us 7.395us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.380us 0.27% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 829.078us -Self CUDA time total: 51.359us +Self CPU time total: 2.028ms +Self CUDA time total: 51.745us @@ -4197,23 +4197,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 380.387us 1176.94% 380.387us 380.387us 1 - hf_kernels_rotary 9.88% 201.876us 99.77% 2.039ms 2.039ms 0.000us 0.00% 34.144us 34.144us 1 - _rotary_dba7d1e::apply_rotary 2.25% 45.971us 4.47% 91.374us 15.229us 21.760us 67.33% 21.760us 3.627us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.760us 67.33% 21.760us 3.627us 6 - aten::clone 1.35% 27.641us 83.24% 1.701ms 283.513us 0.000us 0.00% 12.384us 2.064us 6 - aten::copy_ 1.82% 37.221us 80.29% 1.641ms 273.476us 10.560us 32.67% 12.384us 2.064us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 32.67% 10.560us 1.760us 6 - Activity Buffer Request 69.28% 1.416ms 69.28% 1.416ms 1.416ms 1.824us 5.64% 1.824us 1.824us 1 - aten::empty_strided 1.59% 32.582us 1.59% 32.582us 5.430us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.19% 187.866us 9.19% 187.866us 31.311us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.75% 35.720us 2.18% 44.611us 3.718us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.44% 8.891us 0.44% 8.891us 0.741us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.22% 45.403us 2.22% 45.403us 7.567us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.671us 0.23% 4.671us 4.671us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.119us 1074.87% 349.119us 349.119us 1 + hf_kernels_rotary 19.04% 160.903us 99.46% 840.408us 840.408us 0.000us 0.00% 34.304us 34.304us 1 + _rotary_dba7d1e::apply_rotary 5.13% 43.361us 10.44% 88.182us 14.697us 21.824us 67.19% 21.824us 3.637us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 21.824us 67.19% 21.824us 3.637us 6 + aten::clone 2.59% 21.862us 64.90% 548.403us 91.400us 0.000us 0.00% 12.480us 2.080us 6 + aten::copy_ 4.23% 35.750us 58.48% 494.121us 82.353us 10.656us 32.81% 12.480us 2.080us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.656us 32.81% 10.656us 1.776us 6 + Activity Buffer Request 32.24% 272.376us 32.24% 272.376us 272.376us 1.824us 5.62% 1.824us 1.824us 1 + aten::empty_strided 3.84% 32.420us 3.84% 32.420us 5.403us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.01% 185.995us 22.01% 185.995us 30.999us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.96% 33.479us 5.08% 42.920us 3.577us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.12% 9.441us 1.12% 9.441us 0.787us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.30% 44.821us 5.30% 44.821us 7.470us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.54% 4.540us 0.54% 4.540us 4.540us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.044ms -Self CUDA time total: 32.320us +Self CPU time total: 844.948us +Self CUDA time total: 32.480us @@ -4223,23 +4223,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 358.145us 697.76% 358.145us 358.145us 1 - hf_kernels_rotary 9.30% 187.776us 99.78% 2.015ms 2.015ms 0.000us 0.00% 54.208us 54.208us 1 - _rotary_dba7d1e::apply_rotary 2.06% 41.530us 4.25% 85.754us 14.292us 34.401us 67.02% 34.401us 5.734us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.401us 67.02% 34.401us 5.734us 6 - aten::clone 1.47% 29.652us 84.14% 1.699ms 283.188us 0.000us 0.00% 19.807us 3.301us 6 - aten::copy_ 1.88% 38.042us 81.10% 1.638ms 272.963us 16.927us 32.98% 19.807us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.927us 32.98% 16.927us 2.821us 6 - Activity Buffer Request 70.14% 1.416ms 70.14% 1.416ms 1.416ms 2.880us 5.61% 2.880us 2.880us 1 - aten::empty_strided 1.57% 31.700us 1.57% 31.700us 5.283us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.08% 183.316us 9.08% 183.316us 30.553us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.65% 33.410us 2.09% 42.241us 3.520us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.44% 8.831us 0.44% 8.831us 0.736us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.19% 44.224us 2.19% 44.224us 7.371us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 4.480us 0.22% 4.480us 4.480us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.887us 674.27% 348.887us 348.887us 1 + hf_kernels_rotary 19.09% 159.564us 99.38% 830.748us 830.748us 0.000us 0.00% 54.623us 54.623us 1 + _rotary_dba7d1e::apply_rotary 5.35% 44.752us 10.58% 88.432us 14.739us 34.688us 67.04% 34.688us 5.781us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 34.688us 67.04% 34.688us 5.781us 6 + aten::clone 2.54% 21.199us 64.56% 539.711us 89.952us 0.000us 0.00% 19.935us 3.323us 6 + aten::copy_ 4.41% 36.861us 58.35% 487.801us 81.300us 17.055us 32.96% 19.935us 3.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.055us 32.96% 17.055us 2.843us 6 + Activity Buffer Request 32.25% 269.616us 32.25% 269.616us 269.616us 2.880us 5.57% 2.880us 2.880us 1 + aten::empty_strided 3.67% 30.711us 3.67% 30.711us 5.119us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.69% 181.324us 21.69% 181.324us 30.221us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.02% 33.622us 5.15% 43.041us 3.587us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.13% 9.419us 1.13% 9.419us 0.785us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.23% 43.680us 5.23% 43.680us 7.280us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.190us 0.62% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.019ms -Self CUDA time total: 51.328us +Self CPU time total: 835.938us +Self CUDA time total: 51.743us @@ -4249,23 +4249,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 361.565us 334.59% 361.565us 361.565us 1 - hf_kernels_rotary 8.80% 177.873us 99.76% 2.017ms 2.017ms 0.000us 0.00% 126.174us 126.174us 1 - aten::clone 1.36% 27.530us 84.48% 1.708ms 284.721us 0.000us 0.00% 69.727us 11.621us 6 - aten::copy_ 1.83% 37.081us 81.46% 1.647ms 274.541us 51.615us 47.76% 69.727us 11.621us 6 - _rotary_dba7d1e::apply_rotary 2.15% 43.402us 4.34% 87.665us 14.611us 56.447us 52.24% 56.447us 9.408us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 56.447us 52.24% 56.447us 9.408us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.615us 47.76% 51.615us 8.603us 6 - Activity Buffer Request 70.51% 1.426ms 70.51% 1.426ms 1.426ms 18.112us 16.76% 18.112us 18.112us 1 - aten::empty_strided 1.66% 33.551us 1.66% 33.551us 5.592us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.12% 184.328us 9.12% 184.328us 30.721us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.73% 34.962us 2.15% 43.472us 3.623us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.42% 8.510us 0.42% 8.510us 0.709us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.19% 44.263us 2.19% 44.263us 7.377us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.810us 0.24% 4.810us 4.810us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 359.992us 330.01% 359.992us 359.992us 1 + hf_kernels_rotary 18.74% 161.775us 99.39% 857.819us 857.819us 0.000us 0.00% 127.645us 127.645us 1 + aten::clone 2.55% 22.001us 65.34% 563.953us 93.992us 0.000us 0.00% 70.878us 11.813us 6 + aten::copy_ 4.20% 36.220us 58.95% 508.752us 84.792us 52.319us 47.96% 70.878us 11.813us 6 + _rotary_dba7d1e::apply_rotary 5.00% 43.130us 10.29% 88.850us 14.808us 56.767us 52.04% 56.767us 9.461us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 56.767us 52.04% 56.767us 9.461us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.319us 47.96% 52.319us 8.720us 6 + Activity Buffer Request 31.55% 272.266us 31.55% 272.266us 272.266us 18.559us 17.01% 18.559us 18.559us 1 + aten::empty_strided 3.85% 33.200us 3.85% 33.200us 5.533us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.20% 200.266us 23.20% 200.266us 33.378us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.92% 33.791us 5.01% 43.241us 3.603us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.09% 9.450us 1.09% 9.450us 0.788us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.30% 45.720us 5.30% 45.720us 7.620us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.61% 5.231us 0.61% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.022ms -Self CUDA time total: 108.062us +Self CPU time total: 863.050us +Self CUDA time total: 109.086us @@ -4275,23 +4275,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 374.332us 209.83% 374.332us 374.332us 1 - hf_kernels_rotary 8.69% 176.335us 99.78% 2.024ms 2.024ms 0.000us 0.00% 202.046us 202.046us 1 - aten::clone 1.35% 27.382us 84.12% 1.707ms 284.468us 0.000us 0.00% 102.112us 17.019us 6 - aten::copy_ 1.89% 38.342us 81.18% 1.647ms 274.513us 78.464us 43.98% 102.112us 17.019us 6 - _rotary_dba7d1e::apply_rotary 2.26% 45.922us 4.48% 90.874us 15.146us 99.934us 56.02% 99.934us 16.656us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 99.934us 56.02% 99.934us 16.656us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.464us 43.98% 78.464us 13.077us 6 - Activity Buffer Request 70.36% 1.428ms 70.36% 1.428ms 1.428ms 23.648us 13.26% 23.648us 23.648us 1 - aten::empty_strided 1.59% 32.350us 1.59% 32.350us 5.392us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.93% 181.117us 8.93% 181.117us 30.186us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.68% 34.110us 2.48% 50.391us 4.199us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.80% 16.281us 0.80% 16.281us 1.357us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.22% 44.952us 2.22% 44.952us 7.492us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.22% 4.521us 0.22% 4.521us 4.521us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 362.974us 203.46% 362.974us 362.974us 1 + hf_kernels_rotary 18.77% 159.855us 99.40% 846.699us 846.699us 0.000us 0.00% 202.112us 202.112us 1 + aten::clone 2.60% 22.140us 65.34% 556.581us 92.764us 0.000us 0.00% 102.335us 17.056us 6 + aten::copy_ 6.25% 53.212us 58.90% 501.700us 83.617us 78.623us 44.07% 102.335us 17.056us 6 + _rotary_dba7d1e::apply_rotary 5.25% 44.680us 10.37% 88.362us 14.727us 99.777us 55.93% 99.777us 16.630us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 99.777us 55.93% 99.777us 16.630us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 78.623us 44.07% 78.623us 13.104us 6 + Activity Buffer Request 31.35% 267.035us 31.35% 267.035us 267.035us 23.712us 13.29% 23.712us 23.712us 1 + aten::empty_strided 3.84% 32.741us 3.84% 32.741us 5.457us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.30% 181.453us 21.30% 181.453us 30.242us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.81% 32.449us 4.92% 41.901us 3.492us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.11% 9.452us 1.11% 9.452us 0.788us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.13% 43.682us 5.13% 43.682us 7.280us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.60% 5.070us 0.60% 5.070us 5.070us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.029ms -Self CUDA time total: 178.398us +Self CPU time total: 851.769us +Self CUDA time total: 178.400us @@ -4301,23 +4301,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.716us 1341.48% 350.716us 350.716us 1 - hf_kernels_rotary 8.88% 178.684us 99.76% 2.007ms 2.007ms 0.000us 0.00% 27.264us 27.264us 1 - _rotary_dba7d1e::apply_rotary 2.16% 43.370us 4.24% 85.224us 14.204us 19.393us 74.18% 19.393us 3.232us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.393us 74.18% 19.393us 3.232us 6 - aten::clone 1.56% 31.330us 84.58% 1.702ms 283.596us 0.000us 0.00% 7.871us 1.312us 6 - aten::copy_ 1.80% 36.292us 81.38% 1.637ms 272.881us 6.751us 25.82% 7.871us 1.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.751us 25.82% 6.751us 1.125us 6 - Activity Buffer Request 70.41% 1.417ms 70.41% 1.417ms 1.417ms 1.120us 4.28% 1.120us 1.120us 1 - aten::empty_strided 1.64% 32.961us 1.64% 32.961us 5.494us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.17% 184.457us 9.17% 184.457us 30.743us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.63% 32.712us 2.06% 41.532us 3.461us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.44% 8.820us 0.44% 8.820us 0.735us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.08% 41.854us 2.08% 41.854us 6.976us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.24% 4.830us 0.24% 4.830us 4.830us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 360.442us 1380.42% 360.442us 360.442us 1 + hf_kernels_rotary 18.94% 160.272us 99.38% 841.148us 841.148us 0.000us 0.00% 27.231us 27.231us 1 + _rotary_dba7d1e::apply_rotary 6.35% 53.781us 11.54% 97.693us 16.282us 19.328us 74.02% 19.328us 3.221us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.328us 74.02% 19.328us 3.221us 6 + aten::clone 2.61% 22.063us 63.73% 539.383us 89.897us 0.000us 0.00% 7.903us 1.317us 6 + aten::copy_ 4.50% 38.070us 57.45% 486.201us 81.033us 6.783us 25.98% 7.903us 1.317us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 25.98% 6.783us 1.130us 6 + Activity Buffer Request 31.33% 265.156us 31.33% 265.156us 265.156us 1.120us 4.29% 1.120us 1.120us 1 + aten::empty_strided 3.68% 31.119us 3.68% 31.119us 5.186us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.62% 182.975us 21.62% 182.975us 30.496us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.07% 34.449us 5.18% 43.800us 3.650us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.10% 9.351us 1.10% 9.351us 0.779us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.19% 43.912us 5.19% 43.912us 7.319us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.210us 0.62% 5.210us 5.210us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.012ms -Self CUDA time total: 26.144us +Self CPU time total: 846.358us +Self CUDA time total: 26.111us @@ -4327,22 +4327,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 377.307us 1377.44% 377.307us 377.307us 1 - hf_kernels_rotary 21.29% 163.294us 99.28% 761.426us 761.426us 0.000us 0.00% 28.704us 28.704us 1 - _rotary_dba7d1e::apply_rotary 5.68% 43.540us 11.49% 88.163us 14.694us 19.584us 71.50% 19.584us 3.264us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.584us 71.50% 19.584us 3.264us 6 - aten::clone 3.08% 23.620us 60.95% 467.436us 77.906us 0.000us 0.00% 9.120us 1.520us 6 - aten::copy_ 5.00% 38.311us 53.59% 411.005us 68.501us 7.808us 28.50% 9.120us 1.520us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 28.50% 7.808us 1.301us 6 - Activity Buffer Request 21.08% 161.645us 21.08% 161.645us 161.645us 1.312us 4.79% 1.312us 1.312us 1 - aten::empty_strided 4.28% 32.811us 4.28% 32.811us 5.468us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 27.52% 211.049us 27.52% 211.049us 35.175us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.33% 33.234us 5.55% 42.533us 3.544us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.21% 9.299us 1.21% 9.299us 0.775us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.82% 44.623us 5.82% 44.623us 7.437us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.72% 5.550us 0.72% 5.550us 5.550us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.461us 1264.83% 346.461us 346.461us 1 + hf_kernels_rotary 20.09% 160.625us 99.34% 794.228us 794.228us 0.000us 0.00% 28.704us 28.704us 1 + _rotary_dba7d1e::apply_rotary 5.81% 46.461us 11.03% 88.152us 14.692us 19.583us 71.49% 19.583us 3.264us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 19.583us 71.49% 19.583us 3.264us 6 + aten::clone 2.69% 21.471us 62.97% 503.431us 83.905us 0.000us 0.00% 9.121us 1.520us 6 + aten::copy_ 4.41% 35.231us 56.20% 449.330us 74.888us 7.809us 28.51% 9.121us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.809us 28.51% 7.809us 1.301us 6 + Activity Buffer Request 29.01% 231.915us 29.01% 231.915us 231.915us 1.312us 4.79% 1.312us 1.312us 1 + aten::empty_strided 4.08% 32.630us 4.08% 32.630us 5.438us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 22.79% 182.184us 22.79% 182.184us 30.364us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.13% 33.049us 5.26% 42.020us 3.502us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.12% 8.971us 1.12% 8.971us 0.748us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.21% 41.691us 5.21% 41.691us 6.949us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.66% 5.270us 0.66% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 766.976us +Self CPU time total: 799.498us Self CUDA time total: 27.392us @@ -4353,23 +4353,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.153us 1234.28% 349.153us 349.153us 1 - hf_kernels_rotary 19.50% 158.266us 99.38% 806.788us 806.788us 0.000us 0.00% 29.600us 29.600us 1 - _rotary_dba7d1e::apply_rotary 5.36% 43.530us 10.78% 87.514us 14.586us 20.544us 72.62% 20.544us 3.424us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.544us 72.62% 20.544us 3.424us 6 - aten::clone 2.63% 21.380us 63.75% 517.547us 86.258us 0.000us 0.00% 9.056us 1.509us 6 - aten::copy_ 4.60% 37.352us 57.23% 464.607us 77.434us 7.744us 27.38% 9.056us 1.509us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 27.38% 7.744us 1.291us 6 - Activity Buffer Request 29.79% 241.838us 29.79% 241.838us 241.838us 1.312us 4.64% 1.312us 1.312us 1 - aten::empty_strided 3.89% 31.560us 3.89% 31.560us 5.260us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.84% 185.417us 22.84% 185.417us 30.903us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.24% 34.459us 5.35% 43.461us 3.622us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.11% 9.002us 1.11% 9.002us 0.750us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.42% 43.984us 5.42% 43.984us 7.331us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.62% 5.020us 0.62% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 349.309us 1232.09% 349.309us 349.309us 1 + hf_kernels_rotary 8.10% 161.823us 99.77% 1.992ms 1.992ms 0.000us 0.00% 29.663us 29.663us 1 + _rotary_dba7d1e::apply_rotary 2.22% 44.350us 4.33% 86.563us 14.427us 20.575us 72.57% 20.575us 3.429us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.575us 72.57% 20.575us 3.429us 6 + aten::clone 1.14% 22.731us 85.22% 1.702ms 283.649us 0.000us 0.00% 9.088us 1.515us 6 + aten::copy_ 1.87% 37.281us 82.45% 1.647ms 274.431us 7.776us 27.43% 9.088us 1.515us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 27.43% 7.776us 1.296us 6 + Activity Buffer Request 71.45% 1.427ms 71.45% 1.427ms 1.427ms 1.312us 4.63% 1.312us 1.312us 1 + aten::empty_strided 1.63% 32.581us 1.63% 32.581us 5.430us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.14% 182.513us 9.14% 182.513us 30.419us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.66% 33.090us 2.10% 42.030us 3.503us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.45% 8.940us 0.45% 8.940us 0.745us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.11% 42.213us 2.11% 42.213us 7.035us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.23% 4.691us 0.23% 4.691us 4.691us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 811.808us -Self CUDA time total: 28.288us +Self CPU time total: 1.997ms +Self CUDA time total: 28.351us @@ -4379,23 +4379,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 348.027us 976.29% 348.027us 348.027us 1 - hf_kernels_rotary 20.53% 156.455us 99.34% 757.166us 757.166us 0.000us 0.00% 37.440us 37.440us 1 - _rotary_dba7d1e::apply_rotary 5.63% 42.881us 11.27% 85.894us 14.316us 25.184us 70.65% 25.184us 4.197us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.184us 70.65% 25.184us 4.197us 6 - aten::clone 3.00% 22.853us 61.65% 469.877us 78.313us 0.000us 0.00% 12.256us 2.043us 6 - aten::copy_ 4.74% 36.121us 54.50% 415.394us 69.232us 10.464us 29.35% 12.256us 2.043us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.464us 29.35% 10.464us 1.744us 6 - Activity Buffer Request 25.88% 197.217us 25.88% 197.217us 197.217us 1.792us 5.03% 1.792us 1.792us 1 - aten::empty_strided 4.15% 31.630us 4.15% 31.630us 5.272us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 23.89% 182.056us 23.89% 182.056us 30.343us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.53% 34.528us 5.90% 44.940us 3.745us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.37% 10.412us 1.37% 10.412us 0.868us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.64% 43.013us 5.64% 43.013us 7.169us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.66% 5.020us 0.66% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.717us 965.68% 346.717us 346.717us 1 + hf_kernels_rotary 8.08% 159.825us 99.74% 1.973ms 1.973ms 0.000us 0.00% 37.696us 37.696us 1 + _rotary_dba7d1e::apply_rotary 2.36% 46.731us 4.45% 88.052us 14.675us 25.344us 70.59% 25.344us 4.224us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.344us 70.59% 25.344us 4.224us 6 + aten::clone 1.15% 22.657us 85.03% 1.682ms 280.364us 0.000us 0.00% 12.352us 2.059us 6 + aten::copy_ 1.85% 36.611us 82.30% 1.628ms 271.377us 10.560us 29.41% 12.352us 2.059us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 29.41% 10.560us 1.760us 6 + Activity Buffer Request 71.35% 1.412ms 71.35% 1.412ms 1.412ms 1.792us 4.99% 1.792us 1.792us 1 + aten::empty_strided 1.58% 31.262us 1.58% 31.262us 5.210us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.10% 180.014us 9.10% 180.014us 30.002us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.71% 33.773us 2.18% 43.112us 3.593us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.47% 9.339us 0.47% 9.339us 0.778us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.09% 41.321us 2.09% 41.321us 6.887us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.26% 5.170us 0.26% 5.170us 5.170us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 762.186us -Self CUDA time total: 35.648us +Self CPU time total: 1.978ms +Self CUDA time total: 35.904us @@ -4405,23 +4405,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 346.012us 1220.37% 346.012us 346.012us 1 - hf_kernels_rotary 19.32% 159.865us 99.40% 822.269us 822.269us 0.000us 0.00% 29.665us 29.665us 1 - _rotary_dba7d1e::apply_rotary 5.23% 43.230us 10.32% 85.383us 14.231us 20.577us 72.57% 20.577us 3.429us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.577us 72.57% 20.577us 3.429us 6 - aten::clone 2.67% 22.091us 64.52% 533.759us 88.960us 0.000us 0.00% 9.088us 1.515us 6 - aten::copy_ 4.35% 36.002us 57.93% 479.208us 79.868us 7.776us 27.43% 9.088us 1.515us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 27.43% 7.776us 1.296us 6 - Activity Buffer Request 31.47% 260.369us 31.47% 260.369us 260.369us 1.312us 4.63% 1.312us 1.312us 1 - aten::empty_strided 3.92% 32.460us 3.92% 32.460us 5.410us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.10% 182.837us 22.10% 182.837us 30.473us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.12% 34.091us 5.23% 43.262us 3.605us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.11% 9.171us 1.11% 9.171us 0.764us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.10% 42.153us 5.10% 42.153us 7.026us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.60% 4.990us 0.60% 4.990us 4.990us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 379.517us 1337.13% 379.517us 379.517us 1 + hf_kernels_rotary 9.04% 183.063us 99.73% 2.019ms 2.019ms 0.000us 0.00% 29.695us 29.695us 1 + _rotary_dba7d1e::apply_rotary 2.30% 46.590us 4.55% 92.183us 15.364us 20.640us 72.72% 20.640us 3.440us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 20.640us 72.72% 20.640us 3.440us 6 + aten::clone 1.11% 22.532us 83.96% 1.699ms 283.211us 0.000us 0.00% 9.055us 1.509us 6 + aten::copy_ 1.86% 37.591us 81.15% 1.642ms 273.739us 7.743us 27.28% 9.055us 1.509us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.743us 27.28% 7.743us 1.290us 6 + Activity Buffer Request 70.05% 1.418ms 70.05% 1.418ms 1.418ms 1.312us 4.62% 1.312us 1.312us 1 + aten::empty_strided 1.69% 34.300us 1.69% 34.300us 5.717us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.24% 187.074us 9.24% 187.074us 31.179us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.69% 34.120us 2.18% 44.030us 3.669us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.49% 9.910us 0.49% 9.910us 0.826us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 2.25% 45.593us 2.25% 45.593us 7.599us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.27% 5.440us 0.27% 5.440us 5.440us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 827.259us -Self CUDA time total: 28.353us +Self CPU time total: 2.024ms +Self CUDA time total: 28.383us @@ -4431,23 +4431,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 379.611us 1063.04% 379.611us 379.611us 1 - hf_kernels_rotary 17.54% 182.966us 99.53% 1.038ms 1.038ms 0.000us 0.00% 37.470us 37.470us 1 - _rotary_dba7d1e::apply_rotary 4.31% 44.959us 8.52% 88.913us 14.819us 25.247us 70.70% 25.247us 4.208us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.247us 70.70% 25.247us 4.208us 6 - aten::clone 2.14% 22.291us 69.13% 721.275us 120.212us 0.000us 0.00% 12.223us 2.037us 6 - aten::copy_ 3.58% 37.312us 63.91% 666.784us 111.131us 10.463us 29.30% 12.223us 2.037us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.463us 29.30% 10.463us 1.744us 6 - Activity Buffer Request 42.63% 444.746us 42.63% 444.746us 444.746us 1.760us 4.93% 1.760us 1.760us 1 - aten::empty_strided 3.09% 32.200us 3.09% 32.200us 5.367us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.71% 184.726us 17.71% 184.726us 30.788us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.45% 36.000us 4.33% 45.221us 3.768us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.88% 9.221us 0.88% 9.221us 0.768us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.21% 43.954us 4.21% 43.954us 7.326us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.47% 4.940us 0.47% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 343.101us 955.55% 343.101us 343.101us 1 + hf_kernels_rotary 21.17% 156.032us 99.29% 731.736us 731.736us 0.000us 0.00% 37.666us 37.666us 1 + _rotary_dba7d1e::apply_rotary 6.10% 44.981us 11.95% 88.102us 14.684us 25.410us 70.77% 25.410us 4.235us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 25.410us 70.77% 25.410us 4.235us 6 + aten::clone 2.78% 20.462us 60.43% 445.371us 74.229us 0.000us 0.00% 12.256us 2.043us 6 + aten::copy_ 4.78% 35.221us 53.31% 392.899us 65.483us 10.496us 29.23% 12.256us 2.043us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 29.23% 10.496us 1.749us 6 + Activity Buffer Request 23.63% 174.154us 23.63% 174.154us 174.154us 1.760us 4.90% 1.760us 1.760us 1 + aten::empty_strided 4.34% 32.010us 4.34% 32.010us 5.335us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 24.90% 183.524us 24.90% 183.524us 30.587us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.46% 32.889us 5.73% 42.231us 3.519us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.27% 9.342us 1.27% 9.342us 0.779us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.85% 43.121us 5.85% 43.121us 7.187us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.71% 5.231us 0.71% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.043ms -Self CUDA time total: 35.710us +Self CPU time total: 736.967us +Self CUDA time total: 35.906us @@ -4457,23 +4457,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.330us 621.69% 350.330us 350.330us 1 - hf_kernels_rotary 20.69% 166.654us 99.40% 800.657us 800.657us 0.000us 0.00% 59.231us 59.231us 1 - _rotary_dba7d1e::apply_rotary 5.43% 43.738us 10.71% 86.292us 14.382us 39.327us 69.79% 39.327us 6.554us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.327us 69.79% 39.327us 6.554us 6 - aten::clone 2.60% 20.920us 62.50% 503.467us 83.911us 0.000us 0.00% 19.904us 3.317us 6 - aten::copy_ 4.42% 35.631us 55.79% 449.427us 74.904us 17.024us 30.21% 19.904us 3.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 30.21% 17.024us 2.837us 6 - Activity Buffer Request 28.71% 231.299us 28.71% 231.299us 231.299us 2.880us 5.11% 2.880us 2.880us 1 - aten::empty_strided 4.11% 33.120us 4.11% 33.120us 5.520us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.66% 182.497us 22.66% 182.497us 30.416us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.34% 34.964us 5.49% 44.244us 3.687us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.15% 9.280us 1.15% 9.280us 0.773us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.28% 42.554us 5.28% 42.554us 7.092us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.60% 4.850us 0.60% 4.850us 4.850us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 352.508us 623.07% 352.508us 352.508us 1 + hf_kernels_rotary 16.04% 161.055us 99.46% 998.502us 998.502us 0.000us 0.00% 59.488us 59.488us 1 + _rotary_dba7d1e::apply_rotary 4.50% 45.140us 8.76% 87.902us 14.650us 39.520us 69.85% 39.520us 6.587us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.520us 69.85% 39.520us 6.587us 6 + aten::clone 2.15% 21.591us 70.51% 707.875us 117.979us 0.000us 0.00% 19.968us 3.328us 6 + aten::copy_ 3.70% 37.171us 65.07% 653.264us 108.877us 17.056us 30.15% 19.968us 3.328us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.056us 30.15% 17.056us 2.843us 6 + Activity Buffer Request 41.99% 421.539us 41.99% 421.539us 421.539us 2.912us 5.15% 2.912us 2.912us 1 + aten::empty_strided 3.29% 33.020us 3.29% 33.020us 5.503us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 19.38% 194.554us 19.38% 194.554us 32.426us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.26% 32.759us 4.15% 41.670us 3.472us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.89% 8.911us 0.89% 8.911us 0.743us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 4.26% 42.762us 4.26% 42.762us 7.127us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.54% 5.441us 0.54% 5.441us 5.441us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 805.507us -Self CUDA time total: 56.351us +Self CPU time total: 1.004ms +Self CUDA time total: 56.576us @@ -4483,23 +4483,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 363.291us 308.26% 363.291us 363.291us 1 - hf_kernels_rotary 19.60% 166.384us 99.43% 844.179us 844.179us 0.000us 0.00% 134.846us 134.846us 1 - aten::clone 2.55% 21.670us 64.54% 547.969us 91.328us 0.000us 0.00% 70.143us 11.691us 6 - aten::copy_ 4.54% 38.561us 58.31% 495.019us 82.503us 53.151us 45.10% 70.143us 11.691us 6 - _rotary_dba7d1e::apply_rotary 4.97% 42.172us 10.27% 87.155us 14.526us 64.703us 54.90% 64.703us 10.784us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.703us 54.90% 64.703us 10.784us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.151us 45.10% 53.151us 8.859us 6 - Activity Buffer Request 32.22% 273.530us 32.22% 273.530us 273.530us 16.992us 14.42% 16.992us 16.992us 1 - aten::empty_strided 3.68% 31.280us 3.68% 31.280us 5.213us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 21.55% 182.928us 21.55% 182.928us 30.488us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.96% 33.580us 5.03% 42.671us 3.556us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.07% 9.091us 1.07% 9.091us 0.758us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 5.30% 44.983us 5.30% 44.983us 7.497us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.57% 4.820us 0.57% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 387.642us 328.65% 387.642us 387.642us 1 + hf_kernels_rotary 18.94% 166.375us 99.43% 873.639us 873.639us 0.000us 0.00% 134.717us 134.717us 1 + aten::clone 2.49% 21.882us 65.15% 572.462us 95.410us 0.000us 0.00% 69.630us 11.605us 6 + aten::copy_ 4.33% 38.039us 58.86% 517.140us 86.190us 52.863us 44.82% 69.630us 11.605us 6 + _rotary_dba7d1e::apply_rotary 5.22% 45.870us 10.42% 91.531us 15.255us 65.087us 55.18% 65.087us 10.848us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.087us 55.18% 65.087us 10.848us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.863us 44.82% 52.863us 8.810us 6 + Activity Buffer Request 30.86% 271.146us 30.86% 271.146us 271.146us 16.767us 14.22% 16.767us 16.767us 1 + aten::empty_strided 3.81% 33.440us 3.81% 33.440us 5.573us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 23.67% 207.955us 23.67% 207.955us 34.659us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.81% 33.492us 4.92% 43.271us 3.606us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.11% 9.779us 1.11% 9.779us 0.815us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.20% 45.661us 5.20% 45.661us 7.610us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.57% 5.000us 0.57% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 848.999us -Self CUDA time total: 117.854us +Self CPU time total: 878.639us +Self CUDA time total: 117.950us @@ -4509,23 +4509,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 370.462us 657.41% 370.462us 370.462us 1 - hf_kernels_rotary 9.39% 189.846us 99.77% 2.018ms 2.018ms 0.000us 0.00% 59.200us 59.200us 1 - _rotary_dba7d1e::apply_rotary 2.15% 43.502us 4.33% 87.525us 14.588us 39.360us 69.85% 39.360us 6.560us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.360us 69.85% 39.360us 6.560us 6 - aten::clone 1.41% 28.463us 83.80% 1.695ms 282.475us 0.000us 0.00% 19.840us 3.307us 6 - aten::copy_ 1.87% 37.890us 80.77% 1.634ms 272.251us 16.992us 30.15% 19.840us 3.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 30.15% 16.992us 2.832us 6 - Activity Buffer Request 69.77% 1.411ms 69.77% 1.411ms 1.411ms 2.848us 5.05% 2.848us 2.848us 1 - aten::empty_strided 1.63% 32.881us 1.63% 32.881us 5.480us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.13% 184.676us 9.13% 184.676us 30.779us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.76% 35.550us 2.25% 45.480us 3.790us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.49% 9.930us 0.49% 9.930us 0.827us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.18% 44.023us 2.18% 44.023us 7.337us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.23% 4.690us 0.23% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 350.008us 617.28% 350.008us 350.008us 1 + hf_kernels_rotary 19.15% 160.263us 99.38% 831.878us 831.878us 0.000us 0.00% 59.582us 59.582us 1 + _rotary_dba7d1e::apply_rotary 5.48% 45.879us 10.51% 87.950us 14.658us 39.551us 69.75% 39.551us 6.592us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 39.551us 69.75% 39.551us 6.592us 6 + aten::clone 2.51% 21.009us 64.67% 541.343us 90.224us 0.000us 0.00% 20.031us 3.338us 6 + aten::copy_ 4.43% 37.081us 58.27% 487.761us 81.294us 17.151us 30.25% 20.031us 3.338us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.151us 30.25% 17.151us 2.858us 6 + Activity Buffer Request 32.40% 271.246us 32.40% 271.246us 271.246us 2.880us 5.08% 2.880us 2.880us 1 + aten::empty_strided 3.89% 32.573us 3.89% 32.573us 5.429us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.44% 179.434us 21.44% 179.434us 29.906us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.94% 32.963us 5.06% 42.322us 3.527us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.12% 9.359us 1.12% 9.359us 0.780us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.03% 42.071us 5.03% 42.071us 7.012us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.200us 0.62% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.022ms -Self CUDA time total: 56.352us +Self CPU time total: 837.078us +Self CUDA time total: 56.702us @@ -4535,23 +4535,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 359.680us 306.26% 359.680us 359.680us 1 - hf_kernels_rotary 9.06% 182.622us 99.75% 2.011ms 2.011ms 0.000us 0.00% 134.753us 134.753us 1 - aten::clone 1.36% 27.350us 84.30% 1.700ms 283.278us 0.000us 0.00% 70.114us 11.686us 6 - aten::copy_ 1.85% 37.232us 81.34% 1.640ms 273.341us 52.802us 44.96% 70.114us 11.686us 6 - _rotary_dba7d1e::apply_rotary 2.09% 42.192us 4.26% 85.926us 14.321us 64.639us 55.04% 64.639us 10.773us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 64.639us 55.04% 64.639us 10.773us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 52.802us 44.96% 52.802us 8.800us 6 - Activity Buffer Request 70.45% 1.420ms 70.45% 1.420ms 1.420ms 17.312us 14.74% 17.312us 17.312us 1 - aten::empty_strided 1.60% 32.271us 1.60% 32.271us 5.379us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.05% 182.507us 9.05% 182.507us 30.418us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 1.67% 33.712us 2.12% 42.832us 3.569us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.45% 9.120us 0.45% 9.120us 0.760us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 2.17% 43.734us 2.17% 43.734us 7.289us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.25% 5.130us 0.25% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 353.498us 298.65% 353.498us 353.498us 1 + hf_kernels_rotary 18.88% 158.066us 99.36% 831.868us 831.868us 0.000us 0.00% 136.124us 136.124us 1 + aten::clone 2.60% 21.739us 64.69% 541.581us 90.263us 0.000us 0.00% 70.911us 11.819us 6 + aten::copy_ 4.33% 36.250us 58.43% 489.151us 81.525us 53.151us 44.90% 70.911us 11.819us 6 + _rotary_dba7d1e::apply_rotary 5.47% 45.759us 10.77% 90.181us 15.030us 65.213us 55.10% 65.213us 10.869us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 65.213us 55.10% 65.213us 10.869us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.151us 44.90% 53.151us 8.859us 6 + Activity Buffer Request 32.62% 273.096us 32.62% 273.096us 273.096us 17.760us 15.00% 17.760us 17.760us 1 + aten::empty_strided 3.67% 30.691us 3.67% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.48% 179.805us 21.48% 179.805us 29.968us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.93% 32.900us 5.02% 42.040us 3.503us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.09% 9.140us 1.09% 9.140us 0.762us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.31% 44.422us 5.31% 44.422us 7.404us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.64% 5.350us 0.64% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.016ms -Self CUDA time total: 117.441us +Self CPU time total: 837.218us +Self CUDA time total: 118.364us @@ -4561,23 +4561,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 363.604us 186.68% 363.604us 363.604us 1 - hf_kernels_rotary 18.95% 159.454us 99.42% 836.628us 836.628us 0.000us 0.00% 218.425us 218.425us 1 - _rotary_dba7d1e::apply_rotary 5.11% 42.982us 10.01% 84.264us 14.044us 114.460us 58.76% 114.460us 19.077us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 114.460us 58.76% 114.460us 19.077us 6 - aten::clone 2.64% 22.190us 65.28% 549.368us 91.561us 0.000us 0.00% 103.965us 17.328us 6 - aten::copy_ 4.30% 36.168us 58.92% 495.836us 82.639us 80.318us 41.24% 103.965us 17.328us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.318us 41.24% 80.318us 13.386us 6 - Activity Buffer Request 32.31% 271.900us 32.31% 271.900us 271.900us 23.647us 12.14% 23.647us 23.647us 1 - aten::empty_strided 3.72% 31.342us 3.72% 31.342us 5.224us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 22.31% 187.768us 22.31% 187.768us 31.295us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.01% 33.772us 5.17% 43.542us 3.628us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 1.16% 9.770us 1.16% 9.770us 0.814us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 4.91% 41.282us 4.91% 41.282us 6.880us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 0.58% 4.880us 0.58% 4.880us 4.880us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 360.988us 184.39% 360.988us 360.988us 1 + hf_kernels_rotary 18.98% 158.685us 99.38% 830.678us 830.678us 0.000us 0.00% 219.452us 219.452us 1 + _rotary_dba7d1e::apply_rotary 5.52% 46.111us 10.69% 89.382us 14.897us 114.877us 58.68% 114.877us 19.146us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 114.877us 58.68% 114.877us 19.146us 6 + aten::clone 2.66% 22.261us 64.58% 539.851us 89.975us 0.000us 0.00% 104.575us 17.429us 6 + aten::copy_ 4.21% 35.161us 58.26% 487.000us 81.167us 80.895us 41.32% 104.575us 17.429us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 80.895us 41.32% 80.895us 13.482us 6 + Activity Buffer Request 32.35% 270.376us 32.35% 270.376us 270.376us 23.680us 12.10% 23.680us 23.680us 1 + aten::empty_strided 3.66% 30.590us 3.66% 30.590us 5.098us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 21.71% 181.463us 21.71% 181.463us 30.244us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 3.99% 33.350us 5.12% 42.760us 3.563us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 1.13% 9.410us 1.13% 9.410us 0.784us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 5.18% 43.271us 5.18% 43.271us 7.212us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 0.62% 5.200us 0.62% 5.200us 5.200us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 841.508us -Self CUDA time total: 194.778us +Self CPU time total: 835.878us +Self CUDA time total: 195.772us @@ -4587,27 +4587,27 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - hf_kernels_rotary 13.69% 161.817us 65.35% 772.637us 772.637us 0.000us 0.00% 853.016us 853.016us 1 - hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 794.618us 101.00% 794.618us 794.618us 1 - aten::clone 1.91% 22.540us 40.85% 482.956us 80.493us 0.000us 0.00% 580.923us 96.820us 6 - aten::copy_ 3.05% 36.119us 36.34% 429.636us 71.606us 514.652us 65.42% 580.923us 96.820us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 514.652us 65.42% 514.652us 85.775us 6 - _rotary_dba7d1e::apply_rotary 3.53% 41.772us 7.15% 84.524us 14.087us 272.093us 34.58% 272.093us 45.349us 6 -void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 272.093us 34.58% 272.093us 45.349us 6 - Activity Buffer Request 17.75% 209.918us 17.75% 209.918us 209.918us 66.271us 8.42% 66.271us 66.271us 1 - aten::empty_strided 2.60% 30.780us 2.60% 30.780us 5.130us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.53% 183.599us 15.53% 183.599us 30.600us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.92% 34.511us 3.67% 43.340us 3.612us 0.000us 0.00% 0.000us 0.000us 12 - aten::as_strided 0.75% 8.829us 0.75% 8.829us 0.736us 0.000us 0.00% 0.000us 0.000us 12 - cudaLaunchKernel 3.62% 42.752us 3.62% 42.752us 7.125us 0.000us 0.00% 0.000us 0.000us 6 - cudaDeviceSynchronize 34.65% 409.744us 34.65% 409.744us 409.744us 0.000us 0.00% 0.000us 0.000us 1 + hf_kernels_rotary 12.86% 162.524us 68.27% 862.699us 862.699us 0.000us 0.00% 851.436us 851.436us 1 + hf_kernels_rotary 0.00% 0.000us 0.00% 0.000us 0.000us 792.815us 101.00% 792.815us 792.815us 1 + aten::clone 1.75% 22.062us 44.83% 566.473us 94.412us 0.000us 0.00% 579.314us 96.552us 6 + aten::copy_ 2.96% 37.431us 40.52% 512.021us 85.337us 512.820us 65.33% 579.314us 96.552us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 512.820us 65.33% 512.820us 85.470us 6 + _rotary_dba7d1e::apply_rotary 3.66% 46.190us 7.09% 89.631us 14.938us 272.122us 34.67% 272.122us 45.354us 6 +void at::native::(anonymous namespace)::unrolled_ele... 0.00% 0.000us 0.00% 0.000us 0.000us 272.122us 34.67% 272.122us 45.354us 6 + Activity Buffer Request 21.88% 276.466us 21.88% 276.466us 276.466us 66.494us 8.47% 66.494us 66.494us 1 + aten::empty_strided 2.56% 32.390us 2.56% 32.390us 5.398us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 15.68% 198.124us 15.68% 198.124us 33.021us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.73% 34.460us 3.49% 44.071us 3.673us 0.000us 0.00% 0.000us 0.000us 12 + aten::as_strided 0.76% 9.611us 0.76% 9.611us 0.801us 0.000us 0.00% 0.000us 0.000us 12 + cudaLaunchKernel 3.44% 43.441us 3.44% 43.441us 7.240us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 31.73% 401.009us 31.73% 401.009us 401.009us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.182ms -Self CUDA time total: 786.745us +Self CPU time total: 1.264ms +Self CUDA time total: 784.942us impl wl p50(ms) ok -hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False +hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.10 False hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.10 False hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False @@ -4618,7 +4618,7 @@ hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False +hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.10 False hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False @@ -4626,21 +4626,21 @@ hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False -hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False -hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.10 False +hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.10 False +hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.10 False +hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 False hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False▶ UV Install LogsFetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s]+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.77it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.75it/s]