diff --git "a/rotary/impls/hf_kernels_rotary.html" "b/rotary/impls/hf_kernels_rotary.html" --- "a/rotary/impls/hf_kernels_rotary.html" +++ "b/rotary/impls/hf_kernels_rotary.html" @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.26s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Wed Oct 29 00:36:23 2025       
+
Wed Oct 29 04:13:37 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   31C    P0             86W /  350W |       0MiB /  46068MiB |     22%      Default |
+| N/A   30C    P0            116W /  350W |       0MiB /  46068MiB |     67%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 4.48s
+Cell: benchmark | 8.00s
  | 
 
 Raw
@@ -3989,23 +3989,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     417.663us      1766.17%     417.663us     417.663us             1  
-                                      hf_kernels_rotary        11.92%     243.797us        99.67%       2.039ms       2.039ms       0.000us         0.00%      24.864us      24.864us             1  
-                          _rotary_dba7d1e::apply_rotary         2.64%      54.054us         5.06%     103.576us      17.263us      16.992us        71.85%      16.992us       2.832us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us        71.85%      16.992us       2.832us             6  
-                                            aten::clone         2.02%      41.272us        79.82%       1.633ms     272.116us       0.000us         0.00%       7.872us       1.312us             6  
-                                            aten::copy_         1.82%      37.200us        74.94%       1.533ms     255.467us       6.656us        28.15%       7.872us       1.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us        28.15%       6.656us       1.109us             6  
-                                Activity Buffer Request        69.47%       1.421ms        69.47%       1.421ms       1.421ms       1.216us         5.14%       1.216us       1.216us             1  
-                                    aten::empty_strided         2.87%      58.622us         2.87%      58.622us       9.770us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.65%      74.674us         3.65%      74.674us      12.446us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.25%      46.121us         2.87%      58.631us       4.886us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.61%      12.510us         0.61%      12.510us       1.042us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.42%      49.522us         2.42%      49.522us       8.254us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.33%       6.691us         0.33%       6.691us       6.691us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     430.557us      1823.31%     430.557us     430.557us             1  
+                                      hf_kernels_rotary        12.45%     261.077us        99.68%       2.091ms       2.091ms       0.000us         0.00%      24.830us      24.830us             1  
+                          _rotary_dba7d1e::apply_rotary         2.74%      57.471us         5.07%     106.292us      17.715us      16.960us        71.82%      16.960us       2.827us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        71.82%      16.960us       2.827us             6  
+                                            aten::clone         2.05%      43.019us        79.24%       1.662ms     276.993us       0.000us         0.00%       7.870us       1.312us             6  
+                                            aten::copy_         2.02%      42.402us        74.32%       1.559ms     259.814us       6.654us        28.18%       7.870us       1.312us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.654us        28.18%       6.654us       1.109us             6  
+                                Activity Buffer Request        68.70%       1.441ms        68.70%       1.441ms       1.441ms       1.216us         5.15%       1.216us       1.216us             1  
+                                    aten::empty_strided         2.86%      60.050us         2.86%      60.050us      10.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         3.60%      75.463us         3.60%      75.463us      12.577us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.30%      48.161us         2.93%      61.552us       5.129us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.64%      13.391us         0.64%      13.391us       1.116us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.33%      48.821us         2.33%      48.821us       8.137us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.32%       6.611us         0.32%       6.611us       6.611us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.045ms
-Self CUDA time total: 23.648us
+Self CPU time total: 2.097ms
+Self CUDA time total: 23.614us
 
 
 
@@ -4015,23 +4015,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     362.684us      1535.76%     362.684us     362.684us             1  
-                                      hf_kernels_rotary         9.63%     184.044us        99.76%       1.906ms       1.906ms       0.000us         0.00%      24.736us      24.736us             1  
-                          _rotary_dba7d1e::apply_rotary         2.64%      50.383us         5.03%      96.065us      16.011us      16.864us        71.41%      16.864us       2.811us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.864us        71.41%      16.864us       2.811us             6  
-                                            aten::clone         1.50%      28.618us        82.74%       1.581ms     263.486us       0.000us         0.00%       7.872us       1.312us             6  
-                                            aten::copy_         1.95%      37.192us        79.54%       1.520ms     253.297us       6.752us        28.59%       7.872us       1.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.752us        28.59%       6.752us       1.125us             6  
-                                Activity Buffer Request        74.55%       1.424ms        74.55%       1.424ms       1.424ms       1.120us         4.74%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.70%      32.513us         1.70%      32.513us       5.419us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         3.05%      58.263us         3.05%      58.263us       9.710us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.86%      35.461us         2.36%      45.051us       3.754us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.50%       9.590us         0.50%       9.590us       0.799us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.39%      45.682us         2.39%      45.682us       7.614us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.600us         0.24%       4.600us       4.600us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     354.397us      1494.59%     354.397us     354.397us             1  
+                                      hf_kernels_rotary         8.83%     167.284us        99.70%       1.889ms       1.889ms       0.000us         0.00%      24.832us      24.832us             1  
+                          _rotary_dba7d1e::apply_rotary         2.30%      43.483us         4.65%      88.083us      14.681us      16.928us        71.39%      16.928us       2.821us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us        71.39%      16.928us       2.821us             6  
+                                            aten::clone         1.15%      21.821us        83.90%       1.589ms     264.887us       0.000us         0.00%       7.904us       1.317us             6  
+                                            aten::copy_         1.92%      36.410us        80.96%       1.534ms     255.610us       6.784us        28.61%       7.904us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us        28.61%       6.784us       1.131us             6  
+                                Activity Buffer Request        76.05%       1.441ms        76.05%       1.441ms       1.441ms       1.120us         4.72%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.79%      33.840us         1.79%      33.840us       5.640us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.99%      56.551us         2.99%      56.551us       9.425us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.80%      34.161us         2.32%      43.981us       3.665us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.52%       9.820us         0.52%       9.820us       0.818us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.35%      44.600us         2.35%      44.600us       7.433us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.30%       5.690us         0.30%       5.690us       5.690us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 23.616us
+Self CPU time total: 1.894ms
+Self CUDA time total: 23.712us
 
 
 
@@ -4041,23 +4041,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.889us      1406.66%     352.889us     352.889us             1  
-                                      hf_kernels_rotary         9.52%     180.074us        99.73%       1.887ms       1.887ms       0.000us         0.00%      26.399us      26.399us             1  
-                          _rotary_dba7d1e::apply_rotary         2.26%      42.841us         4.55%      86.004us      14.334us      17.248us        68.75%      17.248us       2.875us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.248us        68.75%      17.248us       2.875us             6  
-                                            aten::clone         1.50%      28.330us        83.30%       1.576ms     262.706us       0.000us         0.00%       9.151us       1.525us             6  
-                                            aten::copy_         1.91%      36.070us        80.06%       1.515ms     252.487us       7.839us        31.25%       9.151us       1.525us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.839us        31.25%       7.839us       1.307us             6  
-                                Activity Buffer Request        75.19%       1.423ms        75.19%       1.423ms       1.423ms       1.312us         5.23%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.74%      32.981us         1.74%      32.981us       5.497us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.97%      56.174us         2.97%      56.174us       9.362us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.86%      35.224us         2.36%      44.742us       3.729us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.50%       9.518us         0.50%       9.518us       0.793us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.28%      43.163us         2.28%      43.163us       7.194us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.27%       5.081us         0.27%       5.081us       5.081us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.514us      1403.08%     351.514us     351.514us             1  
+                                      hf_kernels_rotary         8.97%     168.905us        99.73%       1.878ms       1.878ms       0.000us         0.00%      26.397us      26.397us             1  
+                          _rotary_dba7d1e::apply_rotary         2.31%      43.581us         4.63%      87.301us      14.550us      17.182us        68.58%      17.182us       2.864us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.182us        68.58%      17.182us       2.864us             6  
+                                            aten::clone         1.22%      22.970us        83.89%       1.580ms     263.366us       0.000us         0.00%       9.215us       1.536us             6  
+                                            aten::copy_         1.97%      37.139us        80.89%       1.524ms     253.959us       7.871us        31.42%       9.215us       1.536us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us        31.42%       7.871us       1.312us             6  
+                                Activity Buffer Request        76.03%       1.432ms        76.03%       1.432ms       1.432ms       1.344us         5.36%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.78%      33.471us         1.78%      33.471us       5.579us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.90%      54.532us         2.90%      54.532us       9.089us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.75%      33.032us         2.23%      42.062us       3.505us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.030us         0.48%       9.030us       0.753us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.32%      43.720us         2.32%      43.720us       7.287us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.180us         0.27%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.892ms
-Self CUDA time total: 25.087us
+Self CPU time total: 1.884ms
+Self CUDA time total: 25.053us
 
 
 
@@ -4067,22 +4067,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.892us      1375.46%     353.892us     353.892us             1  
-                                      hf_kernels_rotary         8.61%     178.135us        99.77%       2.063ms       2.063ms       0.000us         0.00%      27.041us      27.041us             1  
-                          _rotary_dba7d1e::apply_rotary         2.02%      41.741us         4.14%      85.532us      14.255us      17.985us        69.90%      17.985us       2.997us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.985us        69.90%      17.985us       2.997us             6  
-                                            aten::clone         1.32%      27.361us        84.83%       1.754ms     292.410us       0.000us         0.00%       9.056us       1.509us             6  
-                                            aten::copy_         1.77%      36.582us        81.87%       1.693ms     282.183us       7.744us        30.10%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        30.10%       7.744us       1.291us             6  
-                                Activity Buffer Request        68.36%       1.414ms        68.36%       1.414ms       1.414ms       1.312us         5.10%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.64%      34.001us         1.64%      34.001us       5.667us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.73%     242.678us        11.73%     242.678us      40.446us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.70%      35.202us         2.18%      45.153us       3.763us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.48%       9.951us         0.48%       9.951us       0.829us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.12%      43.791us         2.12%      43.791us       7.299us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.830us         0.23%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.719us      1363.13%     350.719us     350.719us             1  
+                                      hf_kernels_rotary         8.14%     166.612us        99.74%       2.042ms       2.042ms       0.000us         0.00%      27.041us      27.041us             1  
+                          _rotary_dba7d1e::apply_rotary         2.16%      44.301us         4.32%      88.521us      14.753us      18.017us        70.03%      18.017us       3.003us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.017us        70.03%      18.017us       3.003us             6  
+                                            aten::clone         1.09%      22.263us        85.15%       1.743ms     290.575us       0.000us         0.00%       9.024us       1.504us             6  
+                                            aten::copy_         1.77%      36.339us        82.49%       1.689ms     281.498us       7.712us        29.97%       9.024us       1.504us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us        29.97%       7.712us       1.285us             6  
+                                Activity Buffer Request        69.05%       1.414ms        69.05%       1.414ms       1.414ms       1.312us         5.10%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.57%      32.200us         1.57%      32.200us       5.367us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        11.67%     238.856us        11.67%     238.856us      39.809us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.67%      34.224us         2.13%      43.713us       3.643us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%       9.489us         0.46%       9.489us       0.791us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.16%      44.220us         2.16%      44.220us       7.370us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.270us         0.26%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.068ms
+Self CPU time total: 2.048ms
 Self CUDA time total: 25.729us
 
 
@@ -4093,23 +4093,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     351.422us      1397.30%     351.422us     351.422us             1  
-                                      hf_kernels_rotary         8.84%     180.886us        99.76%       2.041ms       2.041ms       0.000us         0.00%      26.462us      26.462us             1  
-                          _rotary_dba7d1e::apply_rotary         2.10%      42.971us         4.17%      85.245us      14.208us      17.214us        68.45%      17.214us       2.869us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.214us        68.45%      17.214us       2.869us             6  
-                                            aten::clone         1.43%      29.360us        84.55%       1.730ms     288.328us       0.000us         0.00%       9.248us       1.541us             6  
-                                            aten::copy_         1.75%      35.821us        81.51%       1.668ms     277.955us       7.936us        31.55%       9.248us       1.541us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        31.55%       7.936us       1.323us             6  
-                                Activity Buffer Request        69.89%       1.430ms        69.89%       1.430ms       1.430ms       1.312us         5.22%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.61%      32.881us         1.61%      32.881us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.87%     201.958us         9.87%     201.958us      33.660us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.76%      36.050us         2.20%      45.010us       3.751us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       8.960us         0.44%       8.960us       0.747us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.07%      42.274us         2.07%      42.274us       7.046us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.920us         0.24%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.430us      1391.53%     350.430us     350.430us             1  
+                                      hf_kernels_rotary         8.29%     166.207us        99.75%       1.999ms       1.999ms       0.000us         0.00%      26.527us      26.527us             1  
+                          _rotary_dba7d1e::apply_rotary         2.23%      44.751us         4.45%      89.251us      14.875us      17.247us        68.49%      17.247us       2.874us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.247us        68.49%      17.247us       2.874us             6  
+                                            aten::clone         1.13%      22.610us        84.88%       1.701ms     283.499us       0.000us         0.00%       9.280us       1.547us             6  
+                                            aten::copy_         1.92%      38.531us        82.21%       1.647ms     274.571us       7.936us        31.51%       9.280us       1.547us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        31.51%       7.936us       1.323us             6  
+                                Activity Buffer Request        70.76%       1.418ms        70.76%       1.418ms       1.418ms       1.344us         5.34%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.54%      30.960us         1.54%      30.960us       5.160us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.53%     190.904us         9.53%     190.904us      31.817us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.66%      33.300us         2.12%      42.550us       3.546us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%       9.250us         0.46%       9.250us       0.771us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.22%      44.500us         2.22%      44.500us       7.417us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.25%       4.920us         0.25%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.046ms
-Self CUDA time total: 25.150us
+Self CPU time total: 2.004ms
+Self CUDA time total: 25.183us
 
 
 
@@ -4119,23 +4119,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     347.166us      1351.10%     347.166us     347.166us             1  
-                                      hf_kernels_rotary        21.36%     176.235us        99.42%     820.279us     820.279us       0.000us         0.00%      27.039us      27.039us             1  
-                          _rotary_dba7d1e::apply_rotary         5.20%      42.901us        10.31%      85.044us      14.174us      17.951us        69.86%      17.951us       2.992us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      17.951us        69.86%      17.951us       2.992us             6  
-                                            aten::clone         2.62%      21.601us        62.49%     515.608us      85.935us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         4.36%      35.950us        55.96%     461.697us      76.950us       7.744us        30.14%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        30.14%       7.744us       1.291us             6  
-                                Activity Buffer Request        27.88%     230.028us        27.88%     230.028us     230.028us       1.344us         5.23%       1.344us       1.344us             1  
-                                    aten::empty_strided         3.92%      32.310us         3.92%      32.310us       5.385us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.72%     195.719us        23.72%     195.719us      32.620us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.18%      34.481us         5.26%      43.392us       3.616us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.08%       8.911us         1.08%       8.911us       0.743us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.11%      42.143us         5.11%      42.143us       7.024us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.58%       4.821us         0.58%       4.821us       4.821us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.057us      1363.40%     352.057us     352.057us             1  
+                                      hf_kernels_rotary         8.30%     165.062us        99.73%       1.982ms       1.982ms       0.000us         0.00%      27.166us      27.166us             1  
+                          _rotary_dba7d1e::apply_rotary         2.21%      43.981us         4.47%      88.794us      14.799us      18.046us        69.89%      18.046us       3.008us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      18.046us        69.89%      18.046us       3.008us             6  
+                                            aten::clone         1.14%      22.690us        84.76%       1.685ms     280.783us       0.000us         0.00%       9.120us       1.520us             6  
+                                            aten::copy_         1.83%      36.352us        82.00%       1.630ms     271.644us       7.776us        30.11%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        30.11%       7.776us       1.296us             6  
+                                Activity Buffer Request        70.82%       1.408ms        70.82%       1.408ms       1.408ms       1.344us         5.20%       1.344us       1.344us             1  
+                                    aten::empty_strided         1.62%      32.140us         1.62%      32.140us       5.357us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.35%     185.845us         9.35%     185.845us      30.974us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.71%      33.929us         2.19%      43.590us       3.632us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.661us         0.49%       9.661us       0.805us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.25%      44.813us         2.25%      44.813us       7.469us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.451us         0.27%       5.451us       5.451us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 825.100us
-Self CUDA time total: 25.695us
+Self CPU time total: 1.988ms
+Self CUDA time total: 25.822us
 
 
 
@@ -4145,23 +4145,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.595us      1078.61%     348.595us     348.595us             1  
-                                      hf_kernels_rotary        21.56%     162.014us        99.35%     746.516us     746.516us       0.000us         0.00%      34.111us      34.111us             1  
-                          _rotary_dba7d1e::apply_rotary         5.56%      41.814us        11.41%      85.705us      14.284us      21.792us        67.43%      21.792us       3.632us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.792us        67.43%      21.792us       3.632us             6  
-                                            aten::clone         2.84%      21.362us        60.59%     455.236us      75.873us       0.000us         0.00%      12.319us       2.053us             6  
-                                            aten::copy_         5.05%      37.942us        53.37%     401.033us      66.839us      10.527us        32.57%      12.319us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us        32.57%      10.527us       1.755us             6  
-                                Activity Buffer Request        22.09%     165.945us        22.09%     165.945us     165.945us       1.792us         5.54%       1.792us       1.792us             1  
-                                    aten::empty_strided         4.37%      32.841us         4.37%      32.841us       5.474us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        26.24%     197.146us        26.24%     197.146us      32.858us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.61%      34.610us         5.80%      43.561us       3.630us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.19%       8.951us         1.19%       8.951us       0.746us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.84%      43.891us         5.84%      43.891us       7.315us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.65%       4.870us         0.65%       4.870us       4.870us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     358.939us      1106.16%     358.939us     358.939us             1  
+                                      hf_kernels_rotary         8.22%     166.055us        99.77%       2.015ms       2.015ms       0.000us         0.00%      34.209us      34.209us             1  
+                          _rotary_dba7d1e::apply_rotary         2.37%      47.870us         4.45%      89.851us      14.975us      21.952us        67.65%      21.952us       3.659us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.952us        67.65%      21.952us       3.659us             6  
+                                            aten::clone         1.14%      23.001us        84.91%       1.715ms     285.805us       0.000us         0.00%      12.257us       2.043us             6  
+                                            aten::copy_         1.81%      36.630us        82.10%       1.658ms     276.333us      10.497us        32.35%      12.257us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.497us        32.35%      10.497us       1.750us             6  
+                                Activity Buffer Request        71.15%       1.437ms        71.15%       1.437ms       1.437ms       1.760us         5.42%       1.760us       1.760us             1  
+                                    aten::empty_strided         1.68%      33.831us         1.68%      33.831us       5.639us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.14%     184.505us         9.14%     184.505us      30.751us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.72%      34.708us         2.18%      44.050us       3.671us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.46%       9.342us         0.46%       9.342us       0.778us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.08%      41.981us         2.08%      41.981us       6.997us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.681us         0.23%       4.681us       4.681us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 751.386us
-Self CUDA time total: 32.319us
+Self CPU time total: 2.019ms
+Self CUDA time total: 32.449us
 
 
 
@@ -4171,23 +4171,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.018us       687.35%     353.018us     353.018us             1  
-                                      hf_kernels_rotary        20.18%     167.279us        99.43%     824.358us     824.358us       0.000us         0.00%      54.175us      54.175us             1  
-                          _rotary_dba7d1e::apply_rotary         5.18%      42.971us        10.43%      86.461us      14.410us      34.432us        67.04%      34.432us       5.739us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.432us        67.04%      34.432us       5.739us             6  
-                                            aten::clone         2.72%      22.563us        63.67%     527.908us      87.985us       0.000us         0.00%      19.743us       3.290us             6  
-                                            aten::copy_         4.40%      36.441us        57.12%     473.605us      78.934us      16.927us        32.96%      19.743us       3.290us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.927us        32.96%      16.927us       2.821us             6  
-                                Activity Buffer Request        29.36%     243.449us        29.36%     243.449us     243.449us       2.816us         5.48%       2.816us       2.816us             1  
-                                    aten::empty_strided         3.83%      31.740us         3.83%      31.740us       5.290us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.37%     193.715us        23.37%     193.715us      32.286us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.09%      33.928us         5.15%      42.710us       3.559us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.06%       8.782us         1.06%       8.782us       0.732us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.25%      43.490us         5.25%      43.490us       7.248us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.57%       4.720us         0.57%       4.720us       4.720us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     358.237us       692.31%     358.237us     358.237us             1  
+                                      hf_kernels_rotary         8.27%     167.723us        99.73%       2.023ms       2.023ms       0.000us         0.00%      54.593us      54.593us             1  
+                          _rotary_dba7d1e::apply_rotary         2.25%      45.682us         4.44%      90.052us      15.009us      34.785us        67.22%      34.785us       5.798us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.785us        67.22%      34.785us       5.798us             6  
+                                            aten::clone         1.16%      23.462us        84.91%       1.722ms     287.005us       0.000us         0.00%      19.808us       3.301us             6  
+                                            aten::copy_         1.80%      36.481us        82.11%       1.665ms     277.534us      16.960us        32.78%      19.808us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us        32.78%      16.960us       2.827us             6  
+                                Activity Buffer Request        71.22%       1.444ms        71.22%       1.444ms       1.444ms       2.848us         5.50%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.65%      33.360us         1.65%      33.360us       5.560us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.09%     184.354us         9.09%     184.354us      30.726us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.63%      33.070us         2.11%      42.771us       3.564us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.48%       9.701us         0.48%       9.701us       0.808us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.19%      44.370us         2.19%      44.370us       7.395us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.380us         0.27%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 829.078us
-Self CUDA time total: 51.359us
+Self CPU time total: 2.028ms
+Self CUDA time total: 51.745us
 
 
 
@@ -4197,23 +4197,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     380.387us      1176.94%     380.387us     380.387us             1  
-                                      hf_kernels_rotary         9.88%     201.876us        99.77%       2.039ms       2.039ms       0.000us         0.00%      34.144us      34.144us             1  
-                          _rotary_dba7d1e::apply_rotary         2.25%      45.971us         4.47%      91.374us      15.229us      21.760us        67.33%      21.760us       3.627us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.760us        67.33%      21.760us       3.627us             6  
-                                            aten::clone         1.35%      27.641us        83.24%       1.701ms     283.513us       0.000us         0.00%      12.384us       2.064us             6  
-                                            aten::copy_         1.82%      37.221us        80.29%       1.641ms     273.476us      10.560us        32.67%      12.384us       2.064us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        32.67%      10.560us       1.760us             6  
-                                Activity Buffer Request        69.28%       1.416ms        69.28%       1.416ms       1.416ms       1.824us         5.64%       1.824us       1.824us             1  
-                                    aten::empty_strided         1.59%      32.582us         1.59%      32.582us       5.430us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.19%     187.866us         9.19%     187.866us      31.311us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.75%      35.720us         2.18%      44.611us       3.718us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       8.891us         0.44%       8.891us       0.741us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.22%      45.403us         2.22%      45.403us       7.567us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.671us         0.23%       4.671us       4.671us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.119us      1074.87%     349.119us     349.119us             1  
+                                      hf_kernels_rotary        19.04%     160.903us        99.46%     840.408us     840.408us       0.000us         0.00%      34.304us      34.304us             1  
+                          _rotary_dba7d1e::apply_rotary         5.13%      43.361us        10.44%      88.182us      14.697us      21.824us        67.19%      21.824us       3.637us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      21.824us        67.19%      21.824us       3.637us             6  
+                                            aten::clone         2.59%      21.862us        64.90%     548.403us      91.400us       0.000us         0.00%      12.480us       2.080us             6  
+                                            aten::copy_         4.23%      35.750us        58.48%     494.121us      82.353us      10.656us        32.81%      12.480us       2.080us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us        32.81%      10.656us       1.776us             6  
+                                Activity Buffer Request        32.24%     272.376us        32.24%     272.376us     272.376us       1.824us         5.62%       1.824us       1.824us             1  
+                                    aten::empty_strided         3.84%      32.420us         3.84%      32.420us       5.403us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.01%     185.995us        22.01%     185.995us      30.999us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.96%      33.479us         5.08%      42.920us       3.577us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.12%       9.441us         1.12%       9.441us       0.787us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.30%      44.821us         5.30%      44.821us       7.470us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       4.540us         0.54%       4.540us       4.540us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.044ms
-Self CUDA time total: 32.320us
+Self CPU time total: 844.948us
+Self CUDA time total: 32.480us
 
 
 
@@ -4223,23 +4223,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     358.145us       697.76%     358.145us     358.145us             1  
-                                      hf_kernels_rotary         9.30%     187.776us        99.78%       2.015ms       2.015ms       0.000us         0.00%      54.208us      54.208us             1  
-                          _rotary_dba7d1e::apply_rotary         2.06%      41.530us         4.25%      85.754us      14.292us      34.401us        67.02%      34.401us       5.734us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.401us        67.02%      34.401us       5.734us             6  
-                                            aten::clone         1.47%      29.652us        84.14%       1.699ms     283.188us       0.000us         0.00%      19.807us       3.301us             6  
-                                            aten::copy_         1.88%      38.042us        81.10%       1.638ms     272.963us      16.927us        32.98%      19.807us       3.301us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.927us        32.98%      16.927us       2.821us             6  
-                                Activity Buffer Request        70.14%       1.416ms        70.14%       1.416ms       1.416ms       2.880us         5.61%       2.880us       2.880us             1  
-                                    aten::empty_strided         1.57%      31.700us         1.57%      31.700us       5.283us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.08%     183.316us         9.08%     183.316us      30.553us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.65%      33.410us         2.09%      42.241us       3.520us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       8.831us         0.44%       8.831us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.19%      44.224us         2.19%      44.224us       7.371us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       4.480us         0.22%       4.480us       4.480us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.887us       674.27%     348.887us     348.887us             1  
+                                      hf_kernels_rotary        19.09%     159.564us        99.38%     830.748us     830.748us       0.000us         0.00%      54.623us      54.623us             1  
+                          _rotary_dba7d1e::apply_rotary         5.35%      44.752us        10.58%      88.432us      14.739us      34.688us        67.04%      34.688us       5.781us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      34.688us        67.04%      34.688us       5.781us             6  
+                                            aten::clone         2.54%      21.199us        64.56%     539.711us      89.952us       0.000us         0.00%      19.935us       3.323us             6  
+                                            aten::copy_         4.41%      36.861us        58.35%     487.801us      81.300us      17.055us        32.96%      19.935us       3.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.055us        32.96%      17.055us       2.843us             6  
+                                Activity Buffer Request        32.25%     269.616us        32.25%     269.616us     269.616us       2.880us         5.57%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.67%      30.711us         3.67%      30.711us       5.119us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.69%     181.324us        21.69%     181.324us      30.221us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.02%      33.622us         5.15%      43.041us       3.587us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.13%       9.419us         1.13%       9.419us       0.785us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.23%      43.680us         5.23%      43.680us       7.280us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       5.190us         0.62%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.019ms
-Self CUDA time total: 51.328us
+Self CPU time total: 835.938us
+Self CUDA time total: 51.743us
 
 
 
@@ -4249,23 +4249,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     361.565us       334.59%     361.565us     361.565us             1  
-                                      hf_kernels_rotary         8.80%     177.873us        99.76%       2.017ms       2.017ms       0.000us         0.00%     126.174us     126.174us             1  
-                                            aten::clone         1.36%      27.530us        84.48%       1.708ms     284.721us       0.000us         0.00%      69.727us      11.621us             6  
-                                            aten::copy_         1.83%      37.081us        81.46%       1.647ms     274.541us      51.615us        47.76%      69.727us      11.621us             6  
-                          _rotary_dba7d1e::apply_rotary         2.15%      43.402us         4.34%      87.665us      14.611us      56.447us        52.24%      56.447us       9.408us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      56.447us        52.24%      56.447us       9.408us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.615us        47.76%      51.615us       8.603us             6  
-                                Activity Buffer Request        70.51%       1.426ms        70.51%       1.426ms       1.426ms      18.112us        16.76%      18.112us      18.112us             1  
-                                    aten::empty_strided         1.66%      33.551us         1.66%      33.551us       5.592us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.12%     184.328us         9.12%     184.328us      30.721us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.73%      34.962us         2.15%      43.472us       3.623us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.42%       8.510us         0.42%       8.510us       0.709us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.19%      44.263us         2.19%      44.263us       7.377us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.810us         0.24%       4.810us       4.810us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     359.992us       330.01%     359.992us     359.992us             1  
+                                      hf_kernels_rotary        18.74%     161.775us        99.39%     857.819us     857.819us       0.000us         0.00%     127.645us     127.645us             1  
+                                            aten::clone         2.55%      22.001us        65.34%     563.953us      93.992us       0.000us         0.00%      70.878us      11.813us             6  
+                                            aten::copy_         4.20%      36.220us        58.95%     508.752us      84.792us      52.319us        47.96%      70.878us      11.813us             6  
+                          _rotary_dba7d1e::apply_rotary         5.00%      43.130us        10.29%      88.850us      14.808us      56.767us        52.04%      56.767us       9.461us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      56.767us        52.04%      56.767us       9.461us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.319us        47.96%      52.319us       8.720us             6  
+                                Activity Buffer Request        31.55%     272.266us        31.55%     272.266us     272.266us      18.559us        17.01%      18.559us      18.559us             1  
+                                    aten::empty_strided         3.85%      33.200us         3.85%      33.200us       5.533us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.20%     200.266us        23.20%     200.266us      33.378us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.92%      33.791us         5.01%      43.241us       3.603us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.09%       9.450us         1.09%       9.450us       0.788us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.30%      45.720us         5.30%      45.720us       7.620us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.61%       5.231us         0.61%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.022ms
-Self CUDA time total: 108.062us
+Self CPU time total: 863.050us
+Self CUDA time total: 109.086us
 
 
 
@@ -4275,23 +4275,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     374.332us       209.83%     374.332us     374.332us             1  
-                                      hf_kernels_rotary         8.69%     176.335us        99.78%       2.024ms       2.024ms       0.000us         0.00%     202.046us     202.046us             1  
-                                            aten::clone         1.35%      27.382us        84.12%       1.707ms     284.468us       0.000us         0.00%     102.112us      17.019us             6  
-                                            aten::copy_         1.89%      38.342us        81.18%       1.647ms     274.513us      78.464us        43.98%     102.112us      17.019us             6  
-                          _rotary_dba7d1e::apply_rotary         2.26%      45.922us         4.48%      90.874us      15.146us      99.934us        56.02%      99.934us      16.656us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      99.934us        56.02%      99.934us      16.656us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.464us        43.98%      78.464us      13.077us             6  
-                                Activity Buffer Request        70.36%       1.428ms        70.36%       1.428ms       1.428ms      23.648us        13.26%      23.648us      23.648us             1  
-                                    aten::empty_strided         1.59%      32.350us         1.59%      32.350us       5.392us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.93%     181.117us         8.93%     181.117us      30.186us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.68%      34.110us         2.48%      50.391us       4.199us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.80%      16.281us         0.80%      16.281us       1.357us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.22%      44.952us         2.22%      44.952us       7.492us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.22%       4.521us         0.22%       4.521us       4.521us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     362.974us       203.46%     362.974us     362.974us             1  
+                                      hf_kernels_rotary        18.77%     159.855us        99.40%     846.699us     846.699us       0.000us         0.00%     202.112us     202.112us             1  
+                                            aten::clone         2.60%      22.140us        65.34%     556.581us      92.764us       0.000us         0.00%     102.335us      17.056us             6  
+                                            aten::copy_         6.25%      53.212us        58.90%     501.700us      83.617us      78.623us        44.07%     102.335us      17.056us             6  
+                          _rotary_dba7d1e::apply_rotary         5.25%      44.680us        10.37%      88.362us      14.727us      99.777us        55.93%      99.777us      16.630us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      99.777us        55.93%      99.777us      16.630us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      78.623us        44.07%      78.623us      13.104us             6  
+                                Activity Buffer Request        31.35%     267.035us        31.35%     267.035us     267.035us      23.712us        13.29%      23.712us      23.712us             1  
+                                    aten::empty_strided         3.84%      32.741us         3.84%      32.741us       5.457us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.30%     181.453us        21.30%     181.453us      30.242us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.81%      32.449us         4.92%      41.901us       3.492us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       9.452us         1.11%       9.452us       0.788us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.13%      43.682us         5.13%      43.682us       7.280us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.60%       5.070us         0.60%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.029ms
-Self CUDA time total: 178.398us
+Self CPU time total: 851.769us
+Self CUDA time total: 178.400us
 
 
 
@@ -4301,23 +4301,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.716us      1341.48%     350.716us     350.716us             1  
-                                      hf_kernels_rotary         8.88%     178.684us        99.76%       2.007ms       2.007ms       0.000us         0.00%      27.264us      27.264us             1  
-                          _rotary_dba7d1e::apply_rotary         2.16%      43.370us         4.24%      85.224us      14.204us      19.393us        74.18%      19.393us       3.232us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.393us        74.18%      19.393us       3.232us             6  
-                                            aten::clone         1.56%      31.330us        84.58%       1.702ms     283.596us       0.000us         0.00%       7.871us       1.312us             6  
-                                            aten::copy_         1.80%      36.292us        81.38%       1.637ms     272.881us       6.751us        25.82%       7.871us       1.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us        25.82%       6.751us       1.125us             6  
-                                Activity Buffer Request        70.41%       1.417ms        70.41%       1.417ms       1.417ms       1.120us         4.28%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.64%      32.961us         1.64%      32.961us       5.494us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.17%     184.457us         9.17%     184.457us      30.743us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.63%      32.712us         2.06%      41.532us       3.461us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.44%       8.820us         0.44%       8.820us       0.735us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.08%      41.854us         2.08%      41.854us       6.976us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.24%       4.830us         0.24%       4.830us       4.830us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     360.442us      1380.42%     360.442us     360.442us             1  
+                                      hf_kernels_rotary        18.94%     160.272us        99.38%     841.148us     841.148us       0.000us         0.00%      27.231us      27.231us             1  
+                          _rotary_dba7d1e::apply_rotary         6.35%      53.781us        11.54%      97.693us      16.282us      19.328us        74.02%      19.328us       3.221us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.328us        74.02%      19.328us       3.221us             6  
+                                            aten::clone         2.61%      22.063us        63.73%     539.383us      89.897us       0.000us         0.00%       7.903us       1.317us             6  
+                                            aten::copy_         4.50%      38.070us        57.45%     486.201us      81.033us       6.783us        25.98%       7.903us       1.317us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us        25.98%       6.783us       1.130us             6  
+                                Activity Buffer Request        31.33%     265.156us        31.33%     265.156us     265.156us       1.120us         4.29%       1.120us       1.120us             1  
+                                    aten::empty_strided         3.68%      31.119us         3.68%      31.119us       5.186us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.62%     182.975us        21.62%     182.975us      30.496us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.07%      34.449us         5.18%      43.800us       3.650us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.10%       9.351us         1.10%       9.351us       0.779us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.19%      43.912us         5.19%      43.912us       7.319us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       5.210us         0.62%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.012ms
-Self CUDA time total: 26.144us
+Self CPU time total: 846.358us
+Self CUDA time total: 26.111us
 
 
 
@@ -4327,22 +4327,22 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     377.307us      1377.44%     377.307us     377.307us             1  
-                                      hf_kernels_rotary        21.29%     163.294us        99.28%     761.426us     761.426us       0.000us         0.00%      28.704us      28.704us             1  
-                          _rotary_dba7d1e::apply_rotary         5.68%      43.540us        11.49%      88.163us      14.694us      19.584us        71.50%      19.584us       3.264us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.584us        71.50%      19.584us       3.264us             6  
-                                            aten::clone         3.08%      23.620us        60.95%     467.436us      77.906us       0.000us         0.00%       9.120us       1.520us             6  
-                                            aten::copy_         5.00%      38.311us        53.59%     411.005us      68.501us       7.808us        28.50%       9.120us       1.520us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us        28.50%       7.808us       1.301us             6  
-                                Activity Buffer Request        21.08%     161.645us        21.08%     161.645us     161.645us       1.312us         4.79%       1.312us       1.312us             1  
-                                    aten::empty_strided         4.28%      32.811us         4.28%      32.811us       5.468us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        27.52%     211.049us        27.52%     211.049us      35.175us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.33%      33.234us         5.55%      42.533us       3.544us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.21%       9.299us         1.21%       9.299us       0.775us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.82%      44.623us         5.82%      44.623us       7.437us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.72%       5.550us         0.72%       5.550us       5.550us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.461us      1264.83%     346.461us     346.461us             1  
+                                      hf_kernels_rotary        20.09%     160.625us        99.34%     794.228us     794.228us       0.000us         0.00%      28.704us      28.704us             1  
+                          _rotary_dba7d1e::apply_rotary         5.81%      46.461us        11.03%      88.152us      14.692us      19.583us        71.49%      19.583us       3.264us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      19.583us        71.49%      19.583us       3.264us             6  
+                                            aten::clone         2.69%      21.471us        62.97%     503.431us      83.905us       0.000us         0.00%       9.121us       1.520us             6  
+                                            aten::copy_         4.41%      35.231us        56.20%     449.330us      74.888us       7.809us        28.51%       9.121us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.809us        28.51%       7.809us       1.301us             6  
+                                Activity Buffer Request        29.01%     231.915us        29.01%     231.915us     231.915us       1.312us         4.79%       1.312us       1.312us             1  
+                                    aten::empty_strided         4.08%      32.630us         4.08%      32.630us       5.438us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        22.79%     182.184us        22.79%     182.184us      30.364us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.13%      33.049us         5.26%      42.020us       3.502us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.12%       8.971us         1.12%       8.971us       0.748us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.21%      41.691us         5.21%      41.691us       6.949us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.66%       5.270us         0.66%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 766.976us
+Self CPU time total: 799.498us
 Self CUDA time total: 27.392us
 
 
@@ -4353,23 +4353,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.153us      1234.28%     349.153us     349.153us             1  
-                                      hf_kernels_rotary        19.50%     158.266us        99.38%     806.788us     806.788us       0.000us         0.00%      29.600us      29.600us             1  
-                          _rotary_dba7d1e::apply_rotary         5.36%      43.530us        10.78%      87.514us      14.586us      20.544us        72.62%      20.544us       3.424us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.544us        72.62%      20.544us       3.424us             6  
-                                            aten::clone         2.63%      21.380us        63.75%     517.547us      86.258us       0.000us         0.00%       9.056us       1.509us             6  
-                                            aten::copy_         4.60%      37.352us        57.23%     464.607us      77.434us       7.744us        27.38%       9.056us       1.509us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us        27.38%       7.744us       1.291us             6  
-                                Activity Buffer Request        29.79%     241.838us        29.79%     241.838us     241.838us       1.312us         4.64%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.89%      31.560us         3.89%      31.560us       5.260us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.84%     185.417us        22.84%     185.417us      30.903us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.24%      34.459us         5.35%      43.461us       3.622us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.11%       9.002us         1.11%       9.002us       0.750us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.42%      43.984us         5.42%      43.984us       7.331us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.62%       5.020us         0.62%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     349.309us      1232.09%     349.309us     349.309us             1  
+                                      hf_kernels_rotary         8.10%     161.823us        99.77%       1.992ms       1.992ms       0.000us         0.00%      29.663us      29.663us             1  
+                          _rotary_dba7d1e::apply_rotary         2.22%      44.350us         4.33%      86.563us      14.427us      20.575us        72.57%      20.575us       3.429us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.575us        72.57%      20.575us       3.429us             6  
+                                            aten::clone         1.14%      22.731us        85.22%       1.702ms     283.649us       0.000us         0.00%       9.088us       1.515us             6  
+                                            aten::copy_         1.87%      37.281us        82.45%       1.647ms     274.431us       7.776us        27.43%       9.088us       1.515us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        27.43%       7.776us       1.296us             6  
+                                Activity Buffer Request        71.45%       1.427ms        71.45%       1.427ms       1.427ms       1.312us         4.63%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.63%      32.581us         1.63%      32.581us       5.430us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.14%     182.513us         9.14%     182.513us      30.419us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.66%      33.090us         2.10%      42.030us       3.503us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.45%       8.940us         0.45%       8.940us       0.745us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.11%      42.213us         2.11%      42.213us       7.035us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.23%       4.691us         0.23%       4.691us       4.691us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 811.808us
-Self CUDA time total: 28.288us
+Self CPU time total: 1.997ms
+Self CUDA time total: 28.351us
 
 
 
@@ -4379,23 +4379,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     348.027us       976.29%     348.027us     348.027us             1  
-                                      hf_kernels_rotary        20.53%     156.455us        99.34%     757.166us     757.166us       0.000us         0.00%      37.440us      37.440us             1  
-                          _rotary_dba7d1e::apply_rotary         5.63%      42.881us        11.27%      85.894us      14.316us      25.184us        70.65%      25.184us       4.197us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.184us        70.65%      25.184us       4.197us             6  
-                                            aten::clone         3.00%      22.853us        61.65%     469.877us      78.313us       0.000us         0.00%      12.256us       2.043us             6  
-                                            aten::copy_         4.74%      36.121us        54.50%     415.394us      69.232us      10.464us        29.35%      12.256us       2.043us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.464us        29.35%      10.464us       1.744us             6  
-                                Activity Buffer Request        25.88%     197.217us        25.88%     197.217us     197.217us       1.792us         5.03%       1.792us       1.792us             1  
-                                    aten::empty_strided         4.15%      31.630us         4.15%      31.630us       5.272us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        23.89%     182.056us        23.89%     182.056us      30.343us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.53%      34.528us         5.90%      44.940us       3.745us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.37%      10.412us         1.37%      10.412us       0.868us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.64%      43.013us         5.64%      43.013us       7.169us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.66%       5.020us         0.66%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.717us       965.68%     346.717us     346.717us             1  
+                                      hf_kernels_rotary         8.08%     159.825us        99.74%       1.973ms       1.973ms       0.000us         0.00%      37.696us      37.696us             1  
+                          _rotary_dba7d1e::apply_rotary         2.36%      46.731us         4.45%      88.052us      14.675us      25.344us        70.59%      25.344us       4.224us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.344us        70.59%      25.344us       4.224us             6  
+                                            aten::clone         1.15%      22.657us        85.03%       1.682ms     280.364us       0.000us         0.00%      12.352us       2.059us             6  
+                                            aten::copy_         1.85%      36.611us        82.30%       1.628ms     271.377us      10.560us        29.41%      12.352us       2.059us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us        29.41%      10.560us       1.760us             6  
+                                Activity Buffer Request        71.35%       1.412ms        71.35%       1.412ms       1.412ms       1.792us         4.99%       1.792us       1.792us             1  
+                                    aten::empty_strided         1.58%      31.262us         1.58%      31.262us       5.210us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.10%     180.014us         9.10%     180.014us      30.002us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.71%      33.773us         2.18%      43.112us       3.593us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.47%       9.339us         0.47%       9.339us       0.778us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.09%      41.321us         2.09%      41.321us       6.887us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.26%       5.170us         0.26%       5.170us       5.170us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 762.186us
-Self CUDA time total: 35.648us
+Self CPU time total: 1.978ms
+Self CUDA time total: 35.904us
 
 
 
@@ -4405,23 +4405,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     346.012us      1220.37%     346.012us     346.012us             1  
-                                      hf_kernels_rotary        19.32%     159.865us        99.40%     822.269us     822.269us       0.000us         0.00%      29.665us      29.665us             1  
-                          _rotary_dba7d1e::apply_rotary         5.23%      43.230us        10.32%      85.383us      14.231us      20.577us        72.57%      20.577us       3.429us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.577us        72.57%      20.577us       3.429us             6  
-                                            aten::clone         2.67%      22.091us        64.52%     533.759us      88.960us       0.000us         0.00%       9.088us       1.515us             6  
-                                            aten::copy_         4.35%      36.002us        57.93%     479.208us      79.868us       7.776us        27.43%       9.088us       1.515us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us        27.43%       7.776us       1.296us             6  
-                                Activity Buffer Request        31.47%     260.369us        31.47%     260.369us     260.369us       1.312us         4.63%       1.312us       1.312us             1  
-                                    aten::empty_strided         3.92%      32.460us         3.92%      32.460us       5.410us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.10%     182.837us        22.10%     182.837us      30.473us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.12%      34.091us         5.23%      43.262us       3.605us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.11%       9.171us         1.11%       9.171us       0.764us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.10%      42.153us         5.10%      42.153us       7.026us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.60%       4.990us         0.60%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     379.517us      1337.13%     379.517us     379.517us             1  
+                                      hf_kernels_rotary         9.04%     183.063us        99.73%       2.019ms       2.019ms       0.000us         0.00%      29.695us      29.695us             1  
+                          _rotary_dba7d1e::apply_rotary         2.30%      46.590us         4.55%      92.183us      15.364us      20.640us        72.72%      20.640us       3.440us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      20.640us        72.72%      20.640us       3.440us             6  
+                                            aten::clone         1.11%      22.532us        83.96%       1.699ms     283.211us       0.000us         0.00%       9.055us       1.509us             6  
+                                            aten::copy_         1.86%      37.591us        81.15%       1.642ms     273.739us       7.743us        27.28%       9.055us       1.509us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us        27.28%       7.743us       1.290us             6  
+                                Activity Buffer Request        70.05%       1.418ms        70.05%       1.418ms       1.418ms       1.312us         4.62%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.69%      34.300us         1.69%      34.300us       5.717us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.24%     187.074us         9.24%     187.074us      31.179us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.69%      34.120us         2.18%      44.030us       3.669us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.49%       9.910us         0.49%       9.910us       0.826us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         2.25%      45.593us         2.25%      45.593us       7.599us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.27%       5.440us         0.27%       5.440us       5.440us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 827.259us
-Self CUDA time total: 28.353us
+Self CPU time total: 2.024ms
+Self CUDA time total: 28.383us
 
 
 
@@ -4431,23 +4431,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     379.611us      1063.04%     379.611us     379.611us             1  
-                                      hf_kernels_rotary        17.54%     182.966us        99.53%       1.038ms       1.038ms       0.000us         0.00%      37.470us      37.470us             1  
-                          _rotary_dba7d1e::apply_rotary         4.31%      44.959us         8.52%      88.913us      14.819us      25.247us        70.70%      25.247us       4.208us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.247us        70.70%      25.247us       4.208us             6  
-                                            aten::clone         2.14%      22.291us        69.13%     721.275us     120.212us       0.000us         0.00%      12.223us       2.037us             6  
-                                            aten::copy_         3.58%      37.312us        63.91%     666.784us     111.131us      10.463us        29.30%      12.223us       2.037us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.463us        29.30%      10.463us       1.744us             6  
-                                Activity Buffer Request        42.63%     444.746us        42.63%     444.746us     444.746us       1.760us         4.93%       1.760us       1.760us             1  
-                                    aten::empty_strided         3.09%      32.200us         3.09%      32.200us       5.367us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.71%     184.726us        17.71%     184.726us      30.788us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.45%      36.000us         4.33%      45.221us       3.768us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.88%       9.221us         0.88%       9.221us       0.768us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.21%      43.954us         4.21%      43.954us       7.326us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.47%       4.940us         0.47%       4.940us       4.940us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     343.101us       955.55%     343.101us     343.101us             1  
+                                      hf_kernels_rotary        21.17%     156.032us        99.29%     731.736us     731.736us       0.000us         0.00%      37.666us      37.666us             1  
+                          _rotary_dba7d1e::apply_rotary         6.10%      44.981us        11.95%      88.102us      14.684us      25.410us        70.77%      25.410us       4.235us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      25.410us        70.77%      25.410us       4.235us             6  
+                                            aten::clone         2.78%      20.462us        60.43%     445.371us      74.229us       0.000us         0.00%      12.256us       2.043us             6  
+                                            aten::copy_         4.78%      35.221us        53.31%     392.899us      65.483us      10.496us        29.23%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us        29.23%      10.496us       1.749us             6  
+                                Activity Buffer Request        23.63%     174.154us        23.63%     174.154us     174.154us       1.760us         4.90%       1.760us       1.760us             1  
+                                    aten::empty_strided         4.34%      32.010us         4.34%      32.010us       5.335us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        24.90%     183.524us        24.90%     183.524us      30.587us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.46%      32.889us         5.73%      42.231us       3.519us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.27%       9.342us         1.27%       9.342us       0.779us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.85%      43.121us         5.85%      43.121us       7.187us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.71%       5.231us         0.71%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.043ms
-Self CUDA time total: 35.710us
+Self CPU time total: 736.967us
+Self CUDA time total: 35.906us
 
 
 
@@ -4457,23 +4457,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.330us       621.69%     350.330us     350.330us             1  
-                                      hf_kernels_rotary        20.69%     166.654us        99.40%     800.657us     800.657us       0.000us         0.00%      59.231us      59.231us             1  
-                          _rotary_dba7d1e::apply_rotary         5.43%      43.738us        10.71%      86.292us      14.382us      39.327us        69.79%      39.327us       6.554us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.327us        69.79%      39.327us       6.554us             6  
-                                            aten::clone         2.60%      20.920us        62.50%     503.467us      83.911us       0.000us         0.00%      19.904us       3.317us             6  
-                                            aten::copy_         4.42%      35.631us        55.79%     449.427us      74.904us      17.024us        30.21%      19.904us       3.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us        30.21%      17.024us       2.837us             6  
-                                Activity Buffer Request        28.71%     231.299us        28.71%     231.299us     231.299us       2.880us         5.11%       2.880us       2.880us             1  
-                                    aten::empty_strided         4.11%      33.120us         4.11%      33.120us       5.520us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.66%     182.497us        22.66%     182.497us      30.416us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.34%      34.964us         5.49%      44.244us       3.687us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.15%       9.280us         1.15%       9.280us       0.773us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.28%      42.554us         5.28%      42.554us       7.092us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.60%       4.850us         0.60%       4.850us       4.850us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     352.508us       623.07%     352.508us     352.508us             1  
+                                      hf_kernels_rotary        16.04%     161.055us        99.46%     998.502us     998.502us       0.000us         0.00%      59.488us      59.488us             1  
+                          _rotary_dba7d1e::apply_rotary         4.50%      45.140us         8.76%      87.902us      14.650us      39.520us        69.85%      39.520us       6.587us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.520us        69.85%      39.520us       6.587us             6  
+                                            aten::clone         2.15%      21.591us        70.51%     707.875us     117.979us       0.000us         0.00%      19.968us       3.328us             6  
+                                            aten::copy_         3.70%      37.171us        65.07%     653.264us     108.877us      17.056us        30.15%      19.968us       3.328us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us        30.15%      17.056us       2.843us             6  
+                                Activity Buffer Request        41.99%     421.539us        41.99%     421.539us     421.539us       2.912us         5.15%       2.912us       2.912us             1  
+                                    aten::empty_strided         3.29%      33.020us         3.29%      33.020us       5.503us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        19.38%     194.554us        19.38%     194.554us      32.426us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.26%      32.759us         4.15%      41.670us       3.472us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.89%       8.911us         0.89%       8.911us       0.743us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         4.26%      42.762us         4.26%      42.762us       7.127us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.54%       5.441us         0.54%       5.441us       5.441us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 805.507us
-Self CUDA time total: 56.351us
+Self CPU time total: 1.004ms
+Self CUDA time total: 56.576us
 
 
 
@@ -4483,23 +4483,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     363.291us       308.26%     363.291us     363.291us             1  
-                                      hf_kernels_rotary        19.60%     166.384us        99.43%     844.179us     844.179us       0.000us         0.00%     134.846us     134.846us             1  
-                                            aten::clone         2.55%      21.670us        64.54%     547.969us      91.328us       0.000us         0.00%      70.143us      11.691us             6  
-                                            aten::copy_         4.54%      38.561us        58.31%     495.019us      82.503us      53.151us        45.10%      70.143us      11.691us             6  
-                          _rotary_dba7d1e::apply_rotary         4.97%      42.172us        10.27%      87.155us      14.526us      64.703us        54.90%      64.703us      10.784us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.703us        54.90%      64.703us      10.784us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.151us        45.10%      53.151us       8.859us             6  
-                                Activity Buffer Request        32.22%     273.530us        32.22%     273.530us     273.530us      16.992us        14.42%      16.992us      16.992us             1  
-                                    aten::empty_strided         3.68%      31.280us         3.68%      31.280us       5.213us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        21.55%     182.928us        21.55%     182.928us      30.488us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.96%      33.580us         5.03%      42.671us       3.556us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.07%       9.091us         1.07%       9.091us       0.758us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         5.30%      44.983us         5.30%      44.983us       7.497us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.57%       4.820us         0.57%       4.820us       4.820us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     387.642us       328.65%     387.642us     387.642us             1  
+                                      hf_kernels_rotary        18.94%     166.375us        99.43%     873.639us     873.639us       0.000us         0.00%     134.717us     134.717us             1  
+                                            aten::clone         2.49%      21.882us        65.15%     572.462us      95.410us       0.000us         0.00%      69.630us      11.605us             6  
+                                            aten::copy_         4.33%      38.039us        58.86%     517.140us      86.190us      52.863us        44.82%      69.630us      11.605us             6  
+                          _rotary_dba7d1e::apply_rotary         5.22%      45.870us        10.42%      91.531us      15.255us      65.087us        55.18%      65.087us      10.848us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.087us        55.18%      65.087us      10.848us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.863us        44.82%      52.863us       8.810us             6  
+                                Activity Buffer Request        30.86%     271.146us        30.86%     271.146us     271.146us      16.767us        14.22%      16.767us      16.767us             1  
+                                    aten::empty_strided         3.81%      33.440us         3.81%      33.440us       5.573us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        23.67%     207.955us        23.67%     207.955us      34.659us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.81%      33.492us         4.92%      43.271us       3.606us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.11%       9.779us         1.11%       9.779us       0.815us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.20%      45.661us         5.20%      45.661us       7.610us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.57%       5.000us         0.57%       5.000us       5.000us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 848.999us
-Self CUDA time total: 117.854us
+Self CPU time total: 878.639us
+Self CUDA time total: 117.950us
 
 
 
@@ -4509,23 +4509,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     370.462us       657.41%     370.462us     370.462us             1  
-                                      hf_kernels_rotary         9.39%     189.846us        99.77%       2.018ms       2.018ms       0.000us         0.00%      59.200us      59.200us             1  
-                          _rotary_dba7d1e::apply_rotary         2.15%      43.502us         4.33%      87.525us      14.588us      39.360us        69.85%      39.360us       6.560us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.360us        69.85%      39.360us       6.560us             6  
-                                            aten::clone         1.41%      28.463us        83.80%       1.695ms     282.475us       0.000us         0.00%      19.840us       3.307us             6  
-                                            aten::copy_         1.87%      37.890us        80.77%       1.634ms     272.251us      16.992us        30.15%      19.840us       3.307us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us        30.15%      16.992us       2.832us             6  
-                                Activity Buffer Request        69.77%       1.411ms        69.77%       1.411ms       1.411ms       2.848us         5.05%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.63%      32.881us         1.63%      32.881us       5.480us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.13%     184.676us         9.13%     184.676us      30.779us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.76%      35.550us         2.25%      45.480us       3.790us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.49%       9.930us         0.49%       9.930us       0.827us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.18%      44.023us         2.18%      44.023us       7.337us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.23%       4.690us         0.23%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     350.008us       617.28%     350.008us     350.008us             1  
+                                      hf_kernels_rotary        19.15%     160.263us        99.38%     831.878us     831.878us       0.000us         0.00%      59.582us      59.582us             1  
+                          _rotary_dba7d1e::apply_rotary         5.48%      45.879us        10.51%      87.950us      14.658us      39.551us        69.75%      39.551us       6.592us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      39.551us        69.75%      39.551us       6.592us             6  
+                                            aten::clone         2.51%      21.009us        64.67%     541.343us      90.224us       0.000us         0.00%      20.031us       3.338us             6  
+                                            aten::copy_         4.43%      37.081us        58.27%     487.761us      81.294us      17.151us        30.25%      20.031us       3.338us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.151us        30.25%      17.151us       2.858us             6  
+                                Activity Buffer Request        32.40%     271.246us        32.40%     271.246us     271.246us       2.880us         5.08%       2.880us       2.880us             1  
+                                    aten::empty_strided         3.89%      32.573us         3.89%      32.573us       5.429us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.44%     179.434us        21.44%     179.434us      29.906us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.94%      32.963us         5.06%      42.322us       3.527us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.12%       9.359us         1.12%       9.359us       0.780us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.03%      42.071us         5.03%      42.071us       7.012us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       5.200us         0.62%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.022ms
-Self CUDA time total: 56.352us
+Self CPU time total: 837.078us
+Self CUDA time total: 56.702us
 
 
 
@@ -4535,23 +4535,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     359.680us       306.26%     359.680us     359.680us             1  
-                                      hf_kernels_rotary         9.06%     182.622us        99.75%       2.011ms       2.011ms       0.000us         0.00%     134.753us     134.753us             1  
-                                            aten::clone         1.36%      27.350us        84.30%       1.700ms     283.278us       0.000us         0.00%      70.114us      11.686us             6  
-                                            aten::copy_         1.85%      37.232us        81.34%       1.640ms     273.341us      52.802us        44.96%      70.114us      11.686us             6  
-                          _rotary_dba7d1e::apply_rotary         2.09%      42.192us         4.26%      85.926us      14.321us      64.639us        55.04%      64.639us      10.773us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      64.639us        55.04%      64.639us      10.773us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.802us        44.96%      52.802us       8.800us             6  
-                                Activity Buffer Request        70.45%       1.420ms        70.45%       1.420ms       1.420ms      17.312us        14.74%      17.312us      17.312us             1  
-                                    aten::empty_strided         1.60%      32.271us         1.60%      32.271us       5.379us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.05%     182.507us         9.05%     182.507us      30.418us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         1.67%      33.712us         2.12%      42.832us       3.569us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.45%       9.120us         0.45%       9.120us       0.760us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         2.17%      43.734us         2.17%      43.734us       7.289us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.25%       5.130us         0.25%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     353.498us       298.65%     353.498us     353.498us             1  
+                                      hf_kernels_rotary        18.88%     158.066us        99.36%     831.868us     831.868us       0.000us         0.00%     136.124us     136.124us             1  
+                                            aten::clone         2.60%      21.739us        64.69%     541.581us      90.263us       0.000us         0.00%      70.911us      11.819us             6  
+                                            aten::copy_         4.33%      36.250us        58.43%     489.151us      81.525us      53.151us        44.90%      70.911us      11.819us             6  
+                          _rotary_dba7d1e::apply_rotary         5.47%      45.759us        10.77%      90.181us      15.030us      65.213us        55.10%      65.213us      10.869us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us      65.213us        55.10%      65.213us      10.869us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.151us        44.90%      53.151us       8.859us             6  
+                                Activity Buffer Request        32.62%     273.096us        32.62%     273.096us     273.096us      17.760us        15.00%      17.760us      17.760us             1  
+                                    aten::empty_strided         3.67%      30.691us         3.67%      30.691us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.48%     179.805us        21.48%     179.805us      29.968us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.93%      32.900us         5.02%      42.040us       3.503us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.09%       9.140us         1.09%       9.140us       0.762us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.31%      44.422us         5.31%      44.422us       7.404us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.64%       5.350us         0.64%       5.350us       5.350us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.016ms
-Self CUDA time total: 117.441us
+Self CPU time total: 837.218us
+Self CUDA time total: 118.364us
 
 
 
@@ -4561,23 +4561,23 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     363.604us       186.68%     363.604us     363.604us             1  
-                                      hf_kernels_rotary        18.95%     159.454us        99.42%     836.628us     836.628us       0.000us         0.00%     218.425us     218.425us             1  
-                          _rotary_dba7d1e::apply_rotary         5.11%      42.982us        10.01%      84.264us      14.044us     114.460us        58.76%     114.460us      19.077us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     114.460us        58.76%     114.460us      19.077us             6  
-                                            aten::clone         2.64%      22.190us        65.28%     549.368us      91.561us       0.000us         0.00%     103.965us      17.328us             6  
-                                            aten::copy_         4.30%      36.168us        58.92%     495.836us      82.639us      80.318us        41.24%     103.965us      17.328us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.318us        41.24%      80.318us      13.386us             6  
-                                Activity Buffer Request        32.31%     271.900us        32.31%     271.900us     271.900us      23.647us        12.14%      23.647us      23.647us             1  
-                                    aten::empty_strided         3.72%      31.342us         3.72%      31.342us       5.224us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        22.31%     187.768us        22.31%     187.768us      31.295us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.01%      33.772us         5.17%      43.542us       3.628us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         1.16%       9.770us         1.16%       9.770us       0.814us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         4.91%      41.282us         4.91%      41.282us       6.880us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize         0.58%       4.880us         0.58%       4.880us       4.880us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     360.988us       184.39%     360.988us     360.988us             1  
+                                      hf_kernels_rotary        18.98%     158.685us        99.38%     830.678us     830.678us       0.000us         0.00%     219.452us     219.452us             1  
+                          _rotary_dba7d1e::apply_rotary         5.52%      46.111us        10.69%      89.382us      14.897us     114.877us        58.68%     114.877us      19.146us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     114.877us        58.68%     114.877us      19.146us             6  
+                                            aten::clone         2.66%      22.261us        64.58%     539.851us      89.975us       0.000us         0.00%     104.575us      17.429us             6  
+                                            aten::copy_         4.21%      35.161us        58.26%     487.000us      81.167us      80.895us        41.32%     104.575us      17.429us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      80.895us        41.32%      80.895us      13.482us             6  
+                                Activity Buffer Request        32.35%     270.376us        32.35%     270.376us     270.376us      23.680us        12.10%      23.680us      23.680us             1  
+                                    aten::empty_strided         3.66%      30.590us         3.66%      30.590us       5.098us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        21.71%     181.463us        21.71%     181.463us      30.244us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         3.99%      33.350us         5.12%      42.760us       3.563us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         1.13%       9.410us         1.13%       9.410us       0.784us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         5.18%      43.271us         5.18%      43.271us       7.212us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize         0.62%       5.200us         0.62%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 841.508us
-Self CUDA time total: 194.778us
+Self CPU time total: 835.878us
+Self CUDA time total: 195.772us
 
 
 
@@ -4587,27 +4587,27 @@ PROFILE TRACE: hf_kernels_rotary | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                      hf_kernels_rotary        13.69%     161.817us        65.35%     772.637us     772.637us       0.000us         0.00%     853.016us     853.016us             1  
-                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     794.618us       101.00%     794.618us     794.618us             1  
-                                            aten::clone         1.91%      22.540us        40.85%     482.956us      80.493us       0.000us         0.00%     580.923us      96.820us             6  
-                                            aten::copy_         3.05%      36.119us        36.34%     429.636us      71.606us     514.652us        65.42%     580.923us      96.820us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     514.652us        65.42%     514.652us      85.775us             6  
-                          _rotary_dba7d1e::apply_rotary         3.53%      41.772us         7.15%      84.524us      14.087us     272.093us        34.58%     272.093us      45.349us             6  
-void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     272.093us        34.58%     272.093us      45.349us             6  
-                                Activity Buffer Request        17.75%     209.918us        17.75%     209.918us     209.918us      66.271us         8.42%      66.271us      66.271us             1  
-                                    aten::empty_strided         2.60%      30.780us         2.60%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.53%     183.599us        15.53%     183.599us      30.600us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.92%      34.511us         3.67%      43.340us       3.612us       0.000us         0.00%       0.000us       0.000us            12  
-                                       aten::as_strided         0.75%       8.829us         0.75%       8.829us       0.736us       0.000us         0.00%       0.000us       0.000us            12  
-                                       cudaLaunchKernel         3.62%      42.752us         3.62%      42.752us       7.125us       0.000us         0.00%       0.000us       0.000us             6  
-                                  cudaDeviceSynchronize        34.65%     409.744us        34.65%     409.744us     409.744us       0.000us         0.00%       0.000us       0.000us             1  
+                                      hf_kernels_rotary        12.86%     162.524us        68.27%     862.699us     862.699us       0.000us         0.00%     851.436us     851.436us             1  
+                                      hf_kernels_rotary         0.00%       0.000us         0.00%       0.000us       0.000us     792.815us       101.00%     792.815us     792.815us             1  
+                                            aten::clone         1.75%      22.062us        44.83%     566.473us      94.412us       0.000us         0.00%     579.314us      96.552us             6  
+                                            aten::copy_         2.96%      37.431us        40.52%     512.021us      85.337us     512.820us        65.33%     579.314us      96.552us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     512.820us        65.33%     512.820us      85.470us             6  
+                          _rotary_dba7d1e::apply_rotary         3.66%      46.190us         7.09%      89.631us      14.938us     272.122us        34.67%     272.122us      45.354us             6  
+void at::native::(anonymous namespace)::unrolled_ele...         0.00%       0.000us         0.00%       0.000us       0.000us     272.122us        34.67%     272.122us      45.354us             6  
+                                Activity Buffer Request        21.88%     276.466us        21.88%     276.466us     276.466us      66.494us         8.47%      66.494us      66.494us             1  
+                                    aten::empty_strided         2.56%      32.390us         2.56%      32.390us       5.398us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        15.68%     198.124us        15.68%     198.124us      33.021us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.73%      34.460us         3.49%      44.071us       3.673us       0.000us         0.00%       0.000us       0.000us            12  
+                                       aten::as_strided         0.76%       9.611us         0.76%       9.611us       0.801us       0.000us         0.00%       0.000us       0.000us            12  
+                                       cudaLaunchKernel         3.44%      43.441us         3.44%      43.441us       7.240us       0.000us         0.00%       0.000us       0.000us             6  
+                                  cudaDeviceSynchronize        31.73%     401.009us        31.73%     401.009us     401.009us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.182ms
-Self CUDA time total: 786.745us
+Self CPU time total: 1.264ms
+Self CUDA time total: 784.942us
 
 
 impl                     wl                  p50(ms)  ok
-hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.09  False
+hf_kernels_rotary        cuda_B1_S128_H32_D128_R64     0.10  False
 hf_kernels_rotary        cuda_B1_S128_H32_D64_R32     0.10  False
 hf_kernels_rotary        cuda_B1_S128_H8_D128_R64     0.10  False
 hf_kernels_rotary        cuda_B1_S128_H8_D64_R32     0.08  False
@@ -4618,7 +4618,7 @@ hf_kernels_rotary        cuda_B1_S2048_H8_D64_R32     0.09  False
 hf_kernels_rotary        cuda_B1_S512_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B1_S512_H32_D64_R32     0.09  False
 hf_kernels_rotary        cuda_B1_S512_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.09  False
+hf_kernels_rotary        cuda_B1_S512_H8_D64_R32     0.10  False
 hf_kernels_rotary        cuda_B2_S128_H32_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B2_S128_H32_D64_R32     0.09  False
 hf_kernels_rotary        cuda_B2_S128_H8_D128_R64     0.09  False
@@ -4626,21 +4626,21 @@ hf_kernels_rotary        cuda_B2_S128_H8_D64_R32     0.09  False
 hf_kernels_rotary        cuda_B2_S2048_H32_D128_R64     0.28  False
 hf_kernels_rotary        cuda_B2_S2048_H32_D64_R32     0.10  False
 hf_kernels_rotary        cuda_B2_S2048_H8_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.09  False
-hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.09  False
-hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S2048_H8_D64_R32     0.10  False
+hf_kernels_rotary        cuda_B2_S512_H32_D128_R64     0.10  False
+hf_kernels_rotary        cuda_B2_S512_H32_D64_R32     0.09  False
 hf_kernels_rotary        cuda_B2_S512_H8_D128_R64     0.09  False
 hf_kernels_rotary        cuda_B2_S512_H8_D64_R32     0.09  False
 
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s] -Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s]
+Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.77it/s] +Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14.75it/s]

Artifacts:

rotary.jsonl