Upload folder using huggingface_hub

- activation/impls/artifacts/benchmark/activation.jsonl +9 -9
- activation/impls/cells/benchmark.py +7 -13
- activation/impls/hf_kernels_swiglu.html +144 -96
- activation/impls/torch_swiglu.html +122 -128
- activation/results/artifacts/combine/latency.svg +38 -38
- activation/results/combined_results.html +79 -79
- causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
- causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
- causal_conv1d/impls/torch_causal_conv1d.html +0 -0
- causal_conv1d/results/artifacts/combine/latency.svg +64 -64
- causal_conv1d/results/combined_results.html +142 -142
- flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
- flash_attn/impls/cells/benchmark.py +8 -9
- flash_attn/impls/flash_attention.html +137 -137
- flash_attn/impls/hf_kernels_flash_attn.html +92 -92
- flash_attn/impls/hf_kernels_flash_attn3.html +84 -89
- flash_attn/impls/mem_efficient_attention.html +133 -185
- flash_attn/impls/sage_attention.html +17 -12
- flash_attn/impls/xformers.html +91 -91
- flash_attn/results/artifacts/combine/latency.svg +55 -55
- flash_attn/results/combined_results.html +141 -141
- layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
- layer_norm/impls/cells/benchmark.py +5 -28
- layer_norm/impls/hf_kernels_layer_norm.html +54 -55
- layer_norm/impls/torch_layer_norm.html +54 -60
- layer_norm/results/artifacts/combine/latency.svg +24 -24
- layer_norm/results/combined_results.html +53 -53
- rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
- rotary/impls/cells/benchmark.py +21 -11
- rotary/impls/hf_kernels_rotary.html +0 -0
- rotary/impls/torch_rotary.html +0 -0
- rotary/results/artifacts/combine/latency.svg +39 -39
- rotary/results/combined_results.html +106 -106
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
The nine removed records (the previous run) are each truncated to `{"ts": "2025-10-` in the page capture. The nine replacement records, in full:
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.039330999982212234, "p50": 0.04005099998494188, "p90": 0.04157099999702041, "mean": 0.040440999998736515, "iqr": 0.0020999999605919584, "raw_times": [0.03947100003642845, 0.04157099999702041, 0.04005099998494188, 0.041780999993079604, 0.039330999982212234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047832000007019815, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0515919999770631, "p50": 0.05179099997576486, "p90": 0.05224099999168175, "mean": 0.05211119997738933, "iqr": 0.0006300000450210064, "raw_times": [0.05224099999168175, 0.0515919999770631, 0.05161099994666074, 0.05179099997576486, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0555309999867859, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04996100000198567, "p50": 0.05194099998107049, "p90": 0.05195099998900332, "mean": 0.05124099998283782, "iqr": 0.0016000000186977559, "raw_times": [0.05194099998107049, 0.05195099998900332, 0.04996100000198567, 0.05200099997182406, 0.050350999970305566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05537100003039086, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04920100002436811, "p50": 0.05169100001012339, "p90": 0.05200099997182406, "mean": 0.051318999987870484, "iqr": 0.000339999985499162, "raw_times": [0.051660999986324896, 0.05204099994671196, 0.04920100002436811, 0.05200099997182406, 0.05169100001012339], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055880999980217894, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04919000002701068, "p50": 0.05105200000343757, "p90": 0.05142099996646721, "mean": 0.050994999992326484, "iqr": 0.0005200000146032835, "raw_times": [0.05090099995186392, 0.05241100001285304, 0.05142099996646721, 0.04919000002701068, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054681999984040885, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04663100003199361, "p50": 0.05066099998884965, "p90": 0.05077099996242396, "mean": 0.049591000004056696, "iqr": 0.0016599999526079046, "raw_times": [0.04911100000981605, 0.05077099996242396, 0.04663100003199361, 0.05078100002720021, 0.05066099998884965], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05457200001046658, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04728099997919344, "p50": 0.050772000008691975, "p90": 0.051271000018005, "mean": 0.04967720000195186, "iqr": 0.003820000017640268, "raw_times": [0.04728099997919344, 0.051271000018005, 0.05161100000350416, 0.050772000008691975, 0.04745100000036473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05381099998658101, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04900099997939833, "p50": 0.04957199996624695, "p90": 0.05115100003649786, "mean": 0.05033119999779956, "iqr": 0.001620000034563418, "raw_times": [0.04900099997939833, 0.052401000004920206, 0.04957199996624695, 0.05115100003649786, 0.049531000001934444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05343100002619394, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04688100000294071, "p50": 0.04992099997025434, "p90": 0.05054100000734252, "mean": 0.049500799991619715, "iqr": 0.0023510000346504967, "raw_times": [0.04688100000294071, 0.04992099997025434, 0.05054100000734252, 0.04818999997269202, 0.051971000004868984], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05505100000391394, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
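Each record above is a single workload measurement with a fixed schema: timestamp and run id, implementation name and tags, workload (wl), environment, latency percentiles in milliseconds (lat_ms), compile time, peak memory, and a correctness block (corr). A minimal sketch for summarizing such a file, assuming only the fields visible in the records above:

import json

# Load every benchmark record from the JSONL artifact.
with open("activation/impls/artifacts/benchmark/activation.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

# One summary row per workload: implementation, workload name, p50 latency, correctness flag.
for r in records:
    print(f'{r["impl"]:<18} {r["wl"]["name"]:<16} p50={r["lat_ms"]["p50"]:.4f} ms ok={r["ok"]}')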
activation/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
 # "numpy",
 # "torch==2.8.0",
 # "kernels-benchmark-tools",
-# "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -13,22 +12,17 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-
+import torch, torch.nn.functional as F
 
-# Load the activation kernel
-activation = get_kernel("kernels-community/activation")
 
-
-
-
-    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
-    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
-    return activation.silu_and_mul(out, input_tensor)
+def swiglu_eager(x):
+    d = x.shape[-1] // 2
+    return F.silu(x[..., :d]) * x[..., d:]
 
 
 run_benchmark(
     kernel_type=KernelTypeEnum.ACTIVATION,
-    impl_name="
-    impl_tags={"family":
-    impl_func=
+    impl_name="torch_eager",
+    impl_tags={"family": "hf-kernels", "backend": "eager"},
+    impl_func=swiglu_eager,
 )
(the three removed impl_* argument values and the removed wrapper's def line are truncated in the page capture)
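Each record in activation.jsonl reports a correctness check against a float32 reference ("ref": "swiglu_fp32", rtol/atol 0.02). A minimal sketch of such a check, assuming the same convention as swiglu_eager above (the input's last dimension holds the gate and up halves, so a D768 workload feeds a 2*768-wide tensor):

import torch
import torch.nn.functional as F

def swiglu_eager(x):
    # The bfloat16 implementation under test (as committed above).
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

def swiglu_fp32_ref(x):
    # Reference: the same math in float32, cast back for comparison.
    d = x.shape[-1] // 2
    x32 = x.float()
    return (F.silu(x32[..., :d]) * x32[..., d:]).to(x.dtype)

x = torch.randn(128, 2 * 768, dtype=torch.bfloat16)
torch.testing.assert_close(swiglu_eager(x), swiglu_fp32_ref(x), rtol=0.02, atol=0.02)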
activation/impls/hf_kernels_swiglu.html CHANGED
(removed-side lines below that end abruptly are truncated in the page capture)
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.21s
+Cell: nv | 0.28s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:12:56 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
 | | | MIG M. |
 |=========================================+========================+======================|
 | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
-| N/A
+| N/A 27C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark |
+Cell: benchmark | 32.53s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 84.479us 2079.23% 84.479us 84.479us 1
+hf_kernels_swiglu 10.30% 179.633us 99.61% 1.737ms 1.737ms 0.000us 0.00% 5.471us 5.471us 1
+_activation_beeaae6::silu_and_mul 1.22% 21.351us 86.54% 1.509ms 502.938us 4.063us 100.00% 5.471us 1.824us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
+Activity Buffer Request 82.53% 1.439ms 82.53% 1.439ms 1.439ms 1.408us 34.65% 1.408us 1.408us 1
+aten::empty 2.76% 48.131us 2.76% 48.131us 16.044us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 2.78% 48.541us 2.78% 48.541us 16.180us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.39% 6.861us 0.39% 6.861us 6.861us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 1.743ms
+Self CUDA time total: 4.063us
@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.383us 1622.96% 64.383us 64.383us 1
+hf_kernels_swiglu 5.77% 91.273us 99.69% 1.576ms 1.576ms 0.000us 0.00% 5.311us 5.311us 1
+_activation_beeaae6::silu_and_mul 1.42% 22.508us 92.74% 1.466ms 488.714us 3.967us 100.00% 5.311us 1.770us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
+Activity Buffer Request 89.71% 1.418ms 89.71% 1.418ms 1.418ms 1.344us 33.88% 1.344us 1.344us 1
+aten::empty 1.18% 18.580us 1.18% 18.580us 6.193us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.61% 25.442us 1.61% 25.442us 8.481us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.31% 4.900us 0.31% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 1.581ms
+Self CUDA time total: 3.967us
@@ -4016,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.375us 1326.60% 65.375us 65.375us 1
+hf_kernels_swiglu 5.63% 88.392us 99.68% 1.565ms 1.565ms 0.000us 0.00% 6.592us 6.592us 1
+_activation_beeaae6::silu_and_mul 1.42% 22.341us 92.82% 1.457ms 485.598us 4.928us 100.00% 6.592us 2.197us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
+Activity Buffer Request 89.75% 1.409ms 89.75% 1.409ms 1.409ms 1.664us 33.77% 1.664us 1.664us 1
+aten::empty 1.23% 19.370us 1.23% 19.370us 6.457us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 1.64% 25.701us 1.64% 25.701us 8.567us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.32% 5.010us 0.32% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 1.570ms
+Self CUDA time total: 4.928us
@@ -4036,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.864us 1618.05% 68.864us 68.864us 1
+hf_kernels_swiglu 5.06% 90.612us 99.72% 1.787ms 1.787ms 0.000us 0.00% 5.696us 5.696us 1
+_activation_beeaae6::silu_and_mul 1.27% 22.842us 93.53% 1.676ms 558.683us 4.256us 100.00% 5.696us 1.899us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
+Activity Buffer Request 78.82% 1.412ms 78.82% 1.412ms 1.412ms 1.440us 33.83% 1.440us 1.440us 1
+aten::empty 1.13% 20.320us 1.13% 20.320us 6.773us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 13.43% 240.735us 13.43% 240.735us 80.245us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.28% 5.081us 0.28% 5.081us 5.081us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 1.792ms
+Self CUDA time total: 4.256us
@@ -4056,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.014us 1176.71% 70.014us 70.014us 1
+hf_kernels_swiglu 5.43% 92.861us 99.73% 1.704ms 1.704ms 0.000us 0.00% 7.933us 7.933us 1
+_activation_beeaae6::silu_and_mul 1.32% 22.490us 93.06% 1.590ms 530.025us 5.950us 100.00% 7.933us 2.644us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.950us 100.00% 5.950us 1.983us 3
+Activity Buffer Request 82.71% 1.413ms 82.71% 1.413ms 1.413ms 1.983us 33.33% 1.983us 1.983us 1
+aten::empty 1.24% 21.111us 1.24% 21.111us 7.037us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 9.03% 154.323us 9.03% 154.323us 51.441us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.27% 4.600us 0.27% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 1.709ms
+Self CUDA time total: 5.950us
@@ -4076,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 71.711us 918.31% 71.711us 71.711us 1
+hf_kernels_swiglu 20.20% 91.983us 98.97% 450.570us 450.570us 0.000us 0.00% 10.402us 10.402us 1
+_activation_beeaae6::silu_and_mul 4.90% 22.310us 74.58% 339.547us 113.182us 7.809us 100.00% 10.402us 3.467us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.809us 100.00% 7.809us 2.603us 3
+Activity Buffer Request 36.02% 164.004us 36.02% 164.004us 164.004us 2.593us 33.21% 2.593us 2.593us 1
+aten::empty 4.18% 19.040us 4.18% 19.040us 6.347us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 33.66% 153.233us 33.66% 153.233us 51.078us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.03% 4.690us 1.03% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 455.260us
+Self CUDA time total: 7.809us
@@ -4096,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.446us 968.24% 64.446us 64.446us 1
+hf_kernels_swiglu 19.89% 86.491us 98.92% 430.210us 430.210us 0.000us 0.00% 8.897us 8.897us 1
+_activation_beeaae6::silu_and_mul 5.08% 22.091us 74.70% 324.868us 108.289us 6.656us 100.00% 8.897us 2.966us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 100.00% 6.656us 2.219us 3
+Activity Buffer Request 34.88% 151.694us 34.88% 151.694us 151.694us 2.241us 33.67% 2.241us 2.241us 1
+aten::empty 4.33% 18.851us 4.33% 18.851us 6.284us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 34.74% 151.083us 34.74% 151.083us 50.361us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.08% 4.700us 1.08% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 434.910us
+Self CUDA time total: 6.656us
@@ -4116,16 +4116,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.471us 735.92% 69.471us 69.471us 1
+hf_kernels_swiglu 5.54% 94.743us 99.69% 1.705ms 1.705ms 0.000us 0.00% 12.608us 12.608us 1
+_activation_beeaae6::silu_and_mul 1.25% 21.451us 93.03% 1.592ms 530.512us 9.440us 100.00% 12.608us 4.203us 3
 void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
+Activity Buffer Request 82.96% 1.419ms 82.96% 1.419ms 1.419ms 3.168us 33.56% 3.168us 3.168us 1
+aten::empty 1.12% 19.220us 1.12% 19.220us 6.407us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 8.81% 150.793us 8.81% 150.793us 50.264us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 0.31% 5.230us 0.31% 5.230us 5.230us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 1.711ms
 Self CUDA time total: 9.440us
@@ -4136,23 +4136,23 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
-(previous run's rows and totals, truncated in the capture)
+hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.606us 520.41% 68.606us 68.606us 1
+hf_kernels_swiglu 20.98% 86.561us 98.91% 408.129us 408.129us 0.000us 0.00% 17.599us 17.599us 1
+_activation_beeaae6::silu_and_mul 5.52% 22.769us 73.39% 302.816us 100.939us 13.183us 100.00% 17.599us 5.866us 3
+void vllm::act_and_mul_kernel<c10::BFloat16, &(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.183us 100.00% 13.183us 4.394us 3
+Activity Buffer Request 29.84% 123.113us 29.84% 123.113us 123.113us 4.416us 33.50% 4.416us 4.416us 1
+aten::empty 4.54% 18.752us 4.54% 18.752us 6.251us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 38.03% 156.934us 38.03% 156.934us 52.311us 0.000us 0.00% 0.000us 0.000us 3
+cudaDeviceSynchronize 1.09% 4.500us 1.09% 4.500us 4.500us 0.000us 0.00% 0.000us 0.000us 1
+Self CPU time total: 412.629us
+Self CUDA time total: 13.183us
 impl wl p50(ms) ok
 hf_kernels_swiglu cuda_T128_D1024 0.03 True
 hf_kernels_swiglu cuda_T128_D2048 0.03 True
-hf_kernels_swiglu cuda_T128_D768 0.
+hf_kernels_swiglu cuda_T128_D768 0.03 True
 hf_kernels_swiglu cuda_T256_D1024 0.03 True
 hf_kernels_swiglu cuda_T256_D2048 0.03 True
 hf_kernels_swiglu cuda_T256_D768 0.03 True
@@ -4163,12 +4163,60 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-(previous install log, empty or not rendered in the capture)
+Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading sympy (6.0MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading setuptools (1.1MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading torch (846.9MiB)
+Downloading hf-xet (3.2MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading pillow (6.7MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufile-cu12
+Downloading kiwisolver
+Downloading hf-xet
+Downloading setuptools
+Downloading fonttools
+Downloading networkx
+Downloading pillow
+Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
+Downloading matplotlib
+Downloading nvidia-cuda-cupti-cu12
+Downloading numpy
+Downloading sympy
+Downloading nvidia-nvjitlink-cu12
+Downloading nvidia-curand-cu12
+Downloading nvidia-cuda-nvrtc-cu12
+Downloading triton
+Downloading nvidia-cufft-cu12
+Downloading nvidia-cusolver-cu12
+Downloading nvidia-cusparse-cu12
+Downloading nvidia-cusparselt-cu12
+Downloading nvidia-nccl-cu12
+Downloading nvidia-cublas-cu12
+Downloading nvidia-cudnn-cu12
+Downloading torch
+Installed 52 packages in 206ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]
-Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 12.
-Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.
+Fetching 7 files: 71%|███████▏ | 5/7 [00:00<00:00, 12.63it/s]
+Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 17.67it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
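The tables above are PyTorch profiler key_averages output. A minimal sketch that produces the same kind of trace on a CUDA machine (a standalone illustration of the profiling pattern, not the kernels-benchmark-tools harness itself, and using the eager SwiGLU rather than the hf kernel):

import torch
import torch.nn.functional as F
from torch.profiler import profile, ProfilerActivity, record_function

x = torch.randn(128, 2 * 768, dtype=torch.bfloat16, device="cuda")

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("hf_kernels_swiglu"):  # appears as the labeled row in the table
        for _ in range(3):  # three calls, matching the "# of Calls = 3" rows above
            d = x.shape[-1] // 2
            y = F.silu(x[..., :d]) * x[..., d:]
    torch.cuda.synchronize()

# Same layout as the traces above: per-op self/total CPU and CUDA times.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))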
activation/impls/torch_swiglu.html
CHANGED
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3918,9 +3918,9 @@ Cell: nv | 0.21s
|
|
| 3918 |
<span class="collapse-indicators">
|
| 3919 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
-
<span id="uv-indicator-benchmark"
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark |
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
|
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.
|
| 3974 |
-
torch_eager 11.
|
| 3975 |
-
aten::silu 3.
|
| 3976 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3977 |
-
aten::mul
|
| 3978 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 3979 |
-
Activity Buffer Request
|
| 3980 |
-
aten::slice 2.
|
| 3981 |
-
aten::as_strided 0.
|
| 3982 |
-
cudaLaunchKernel 3.
|
| 3983 |
-
cudaDeviceSynchronize 0.
|
| 3984 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3985 |
-
Self CPU time total: 1.
|
| 3986 |
-
Self CUDA time total: 12.
|
| 3987 |
|
| 3988 |
|
| 3989 |
|
|
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
|
|
| 3993 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3994 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 3997 |
-
torch_eager 7.
|
| 3998 |
-
aten::silu 2.
|
| 3999 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4000 |
-
aten::mul
|
| 4001 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.
|
| 4002 |
-
Activity Buffer Request
|
| 4003 |
-
aten::slice 1.
|
| 4004 |
-
aten::as_strided 0.36% 6.
|
| 4005 |
-
cudaLaunchKernel 2.
|
| 4006 |
-
cudaDeviceSynchronize 0.
|
| 4007 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4008 |
-
Self CPU time total: 1.
|
| 4009 |
-
Self CUDA time total: 12.
|
| 4010 |
|
| 4011 |
|
| 4012 |
|
|
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
|
|
| 4016 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4017 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4018 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4019 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4020 |
-
torch_eager 7.
|
| 4021 |
-
aten::silu 2.
|
| 4022 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4023 |
-
aten::mul 1.
|
| 4024 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4025 |
-
Activity Buffer Request 83.
|
| 4026 |
-
aten::slice 1.
|
| 4027 |
-
aten::as_strided 0.
|
| 4028 |
-
cudaLaunchKernel 2.
|
| 4029 |
-
cudaDeviceSynchronize 0.
|
| 4030 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4031 |
-
Self CPU time total: 1.
|
| 4032 |
-
Self CUDA time total: 13.
|
| 4033 |
|
| 4034 |
|
| 4035 |
|
|
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
|
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4041 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4042 |
-
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us
|
| 4043 |
-
torch_eager
|
| 4044 |
-
aten::silu
|
| 4045 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.
|
| 4046 |
-
aten::mul 1.
|
| 4047 |
-
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.
|
| 4048 |
-
Activity Buffer Request
|
| 4049 |
-
aten::slice 1.
|
| 4050 |
-
aten::as_strided 0.
|
| 4051 |
-
cudaLaunchKernel
|
| 4052 |
-
cudaDeviceSynchronize 0.
|
| 4053 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4054 |
-
Self CPU time total: 1.
|
| 4055 |
-
Self CUDA time total: 12.
|
| 4056 |
|
| 4057 |
|
| 4058 |
|
|
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
-[old profiler-trace table rows (torch_eager, aten::silu, aten::mul, elementwise kernels, CUDA launch/sync); timing values truncated by the diff viewer]
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
-[old profiler-trace table rows (torch_eager, aten::silu, aten::mul, elementwise kernels, CUDA launch/sync); timing values truncated by the diff viewer]
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
-[old profiler-trace table rows (torch_eager, aten::silu, aten::mul, elementwise kernels, CUDA launch/sync); timing values truncated by the diff viewer]
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
-[old profiler-trace table rows (torch_eager, aten::silu, aten::mul, elementwise kernels, CUDA launch/sync); timing values truncated by the diff viewer]
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
-[old profiler-trace table rows (torch_eager, aten::silu, aten::mul, elementwise kernels, CUDA launch/sync); timing values truncated by the diff viewer]
impl wl p50(ms) ok
@@ -4181,12 +4181,6 @@ torch_eager cuda_T512_D1024 0.05 True
 torch_eager cuda_T512_D2048 0.05 True
 torch_eager cuda_T512_D768 0.05 True
 </pre></div>
-<div class="uv-install-logs" id="uv-logs-benchmark">
-<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
-<div class="uv-logs-content" style="display: none;">
-Installed 37 packages in 235ms
-</div>
-</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.28s
 | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:12:56 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
 |-----------------------------------------+------------------------+----------------------+
 | | | MIG M. |
 |=========================================+========================+======================|
 | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+| N/A 27C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
 | | | N/A |
 +-----------------------------------------+------------------------+----------------------+
 <span class="collapse-indicators">
 <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 3.39s
 | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
PROFILE TRACE: torch_eager | cuda_T128_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.672us 1646.72% 208.672us 208.672us 1
+torch_eager 11.52% 217.973us 99.62% 1.885ms 1.885ms 0.000us 0.00% 14.976us 14.976us 1
+aten::silu 3.07% 58.081us 81.78% 1.547ms 515.694us 6.464us 51.01% 8.768us 2.923us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 51.01% 6.464us 2.155us 3
+aten::mul 1.91% 36.092us 3.28% 62.082us 20.694us 6.208us 48.99% 6.208us 2.069us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.99% 6.208us 2.069us 3
+Activity Buffer Request 76.33% 1.444ms 76.33% 1.444ms 1.444ms 2.304us 18.18% 2.304us 2.304us 1
+aten::slice 2.46% 46.622us 3.04% 57.552us 9.592us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.58% 10.930us 0.58% 10.930us 1.822us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 3.75% 71.021us 3.75% 71.021us 11.837us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.38% 7.160us 0.38% 7.160us 7.160us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.892ms
+Self CUDA time total: 12.672us
PROFILE TRACE: torch_eager | cuda_T128_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 189.724us 1532.13% 189.724us 189.724us 1
+torch_eager 7.75% 136.545us 99.70% 1.756ms 1.756ms 0.000us 0.00% 14.559us 14.559us 1
+aten::silu 2.47% 43.560us 85.85% 1.512ms 503.984us 6.399us 51.68% 8.575us 2.858us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
+aten::mul 2.87% 50.460us 4.18% 73.560us 24.520us 5.984us 48.32% 5.984us 1.995us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
+Activity Buffer Request 81.76% 1.440ms 81.76% 1.440ms 1.440ms 2.176us 17.57% 2.176us 2.176us 1
+aten::slice 1.56% 27.471us 1.92% 33.791us 5.632us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.36% 6.320us 0.36% 6.320us 1.053us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 2.93% 51.591us 2.93% 51.591us 8.598us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.30% 5.280us 0.30% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.761ms
+Self CUDA time total: 12.383us
PROFILE TRACE: torch_eager | cuda_T128_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.261us 1202.24% 159.261us 159.261us 1
+torch_eager 7.59% 133.144us 99.70% 1.749ms 1.749ms 0.000us 0.00% 15.487us 15.487us 1
+aten::silu 2.45% 42.980us 87.40% 1.533ms 511.158us 6.783us 51.20% 9.023us 3.008us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 51.20% 6.783us 2.261us 3
+aten::mul 1.60% 28.151us 2.82% 49.551us 16.517us 6.464us 48.80% 6.464us 2.155us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.80% 6.464us 2.155us 3
+Activity Buffer Request 83.38% 1.463ms 83.38% 1.463ms 1.463ms 2.240us 16.91% 2.240us 2.240us 1
+aten::slice 1.54% 26.990us 1.89% 33.190us 5.532us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.35% 6.200us 0.35% 6.200us 1.033us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 2.79% 48.992us 2.79% 48.992us 8.165us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.30% 5.190us 0.30% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.755ms
+Self CUDA time total: 13.247us
PROFILE TRACE: torch_eager | cuda_T256_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.719us 1246.22% 158.719us 158.719us 1
+torch_eager 6.58% 125.161us 99.76% 1.897ms 1.897ms 0.000us 0.00% 14.944us 14.944us 1
+aten::silu 2.27% 43.111us 89.01% 1.692ms 564.032us 6.560us 51.51% 8.768us 2.923us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
+aten::mul 1.36% 25.870us 2.47% 46.950us 15.650us 6.176us 48.49% 6.176us 2.059us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
+Activity Buffer Request 75.60% 1.437ms 75.60% 1.437ms 1.437ms 2.208us 17.34% 2.208us 2.208us 1
+aten::slice 1.39% 26.382us 1.70% 32.293us 5.382us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.31% 5.911us 0.31% 5.911us 0.985us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 12.25% 232.925us 12.25% 232.925us 38.821us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.24% 4.510us 0.24% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.901ms
+Self CUDA time total: 12.736us
PROFILE TRACE: torch_eager | cuda_T256_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.887us 1158.61% 153.887us 153.887us 1
+torch_eager 6.96% 128.034us 99.73% 1.834ms 1.834ms 0.000us 0.00% 15.586us 15.586us 1
+aten::silu 2.31% 42.562us 88.63% 1.630ms 543.305us 6.849us 51.57% 9.153us 3.051us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.849us 51.57% 6.849us 2.283us 3
+aten::mul 1.46% 26.931us 2.44% 44.851us 14.950us 6.433us 48.43% 6.433us 2.144us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.43% 6.433us 2.144us 3
+Activity Buffer Request 77.32% 1.422ms 77.32% 1.422ms 1.422ms 2.304us 17.35% 2.304us 2.304us 1
+aten::slice 1.36% 24.939us 1.70% 31.240us 5.207us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.34% 6.301us 0.34% 6.301us 1.050us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 9.97% 183.363us 9.97% 183.363us 30.561us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.839ms
+Self CUDA time total: 13.282us
PROFILE TRACE: torch_eager | cuda_T256_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.278us 1009.23% 157.278us 157.278us 1
+torch_eager 8.12% 150.915us 99.71% 1.854ms 1.854ms 0.000us 0.00% 18.272us 18.272us 1
+aten::silu 2.38% 44.260us 87.35% 1.624ms 541.305us 8.000us 51.33% 10.688us 3.563us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 51.33% 8.000us 2.667us 3
+aten::mul 1.41% 26.151us 2.51% 46.701us 15.567us 7.584us 48.67% 7.584us 2.528us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.67% 7.584us 2.528us 3
+Activity Buffer Request 76.39% 1.420ms 76.39% 1.420ms 1.420ms 2.688us 17.25% 2.688us 2.688us 1
+aten::slice 1.39% 25.840us 1.73% 32.160us 5.360us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.34% 6.320us 0.34% 6.320us 1.053us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 9.68% 179.994us 9.68% 179.994us 29.999us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.29% 5.351us 0.29% 5.351us 5.351us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.859ms
+Self CUDA time total: 15.584us
PROFILE TRACE: torch_eager | cuda_T512_D768
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 161.728us 1130.65% 161.728us 161.728us 1
+torch_eager 7.31% 130.302us 99.73% 1.777ms 1.777ms 0.000us 0.00% 16.768us 16.768us 1
+aten::silu 2.39% 42.651us 87.87% 1.566ms 521.901us 7.328us 51.23% 9.792us 3.264us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
+aten::mul 1.55% 27.651us 2.68% 47.751us 15.917us 6.976us 48.77% 6.976us 2.325us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
+Activity Buffer Request 76.61% 1.365ms 76.61% 1.365ms 1.365ms 2.464us 17.23% 2.464us 2.464us 1
+aten::slice 1.50% 26.642us 1.87% 33.262us 5.544us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.37% 6.620us 0.37% 6.620us 1.103us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 9.99% 177.974us 9.99% 177.974us 29.662us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.27% 4.870us 0.27% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.782ms
+Self CUDA time total: 14.304us
PROFILE TRACE: torch_eager | cuda_T512_D1024
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.416us 1005.63% 156.416us 156.416us 1
+torch_eager 7.17% 130.703us 99.74% 1.819ms 1.819ms 0.000us 0.00% 18.243us 18.243us 1
+aten::silu 2.30% 42.032us 88.31% 1.611ms 536.959us 7.970us 51.24% 10.659us 3.553us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.970us 51.24% 7.970us 2.657us 3
+aten::mul 1.41% 25.800us 2.54% 46.410us 15.470us 7.584us 48.76% 7.584us 2.528us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
+Activity Buffer Request 77.37% 1.411ms 77.37% 1.411ms 1.411ms 2.689us 17.29% 2.689us 2.689us 1
+aten::slice 1.41% 25.640us 1.72% 31.370us 5.228us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.31% 5.730us 0.31% 5.730us 0.955us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 9.77% 178.145us 9.77% 178.145us 29.691us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.824ms
+Self CUDA time total: 15.554us
PROFILE TRACE: torch_eager | cuda_T512_D2048
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.390us 709.54% 159.390us 159.390us 1
+torch_eager 6.97% 127.342us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.336us 26.336us 1
+aten::silu 2.35% 42.870us 88.50% 1.617ms 539.138us 11.520us 51.28% 15.392us 5.131us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 51.28% 11.520us 3.840us 3
+aten::mul 1.55% 28.251us 2.57% 47.051us 15.684us 10.944us 48.72% 10.944us 3.648us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.72% 10.944us 3.648us 3
+Activity Buffer Request 77.70% 1.420ms 77.70% 1.420ms 1.420ms 3.872us 17.24% 3.872us 3.872us 1
+aten::slice 1.38% 25.151us 1.70% 31.112us 5.185us 0.000us 0.00% 0.000us 0.000us 6
+aten::as_strided 0.33% 5.961us 0.33% 5.961us 0.993us 0.000us 0.00% 0.000us 0.000us 6
+cudaLaunchKernel 9.48% 173.263us 9.48% 173.263us 28.877us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 0.26% 4.721us 0.26% 4.721us 4.721us 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.828ms
+Self CUDA time total: 22.464us
impl wl p50(ms) ok
 torch_eager cuda_T512_D2048 0.05 True
 torch_eager cuda_T512_D768 0.05 True
 </pre></div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
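Reading the traces above: the two elementwise_kernel launches come from aten::silu and aten::mul, the aten::slice/aten::as_strided rows come from splitting the projected activations into gate and up halves, and the large Activity Buffer Request row is profiler buffer allocation rather than kernel work. A minimal sketch of how a trace of this shape can be reproduced with torch.profiler; the tensor shape, iteration count, and record_function label are assumptions, not the exact benchmark.py cell:

```python
import torch
import torch.nn.functional as F
from torch.profiler import profile, record_function, ProfilerActivity

def swiglu_torch_eager(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half (shows up as aten::slice / aten::as_strided),
    # then gate one half with SiLU of the other (aten::silu + aten::mul).
    gate, up = x.chunk(2, dim=-1)
    return F.silu(gate) * up

# Assumed workload shape for cuda_T128_D768: 128 tokens, 2 * 768 projected features.
x = torch.randn(128, 2 * 768, device="cuda", dtype=torch.bfloat16)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("torch_eager"):  # the labeled rows in the tables above
        for _ in range(3):                # three calls -> "# of Calls" = 3 per op
            swiglu_torch_eager(x)
    torch.cuda.synchronize()              # accounts for the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
```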
activation/results/artifacts/combine/latency.svg
CHANGED

activation/results/combined_results.html
CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-[old y-axis grid, tick, and tick-label SVG markup (ytick_1 through ytick_6); coordinates and tick values truncated by the diff viewer]
@@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
-[old data paths and point markers for series--hf-kernels-swiglu and series--torch-eager; coordinates truncated by the diff viewer]
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | 4.
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4284,7 +4284,7 @@ COMBINED BENCHMARK SUMMARY
 impl wl p50(ms) ok
 hf_kernels_swiglu cuda_T128_D1024 0.03 True
 hf_kernels_swiglu cuda_T128_D2048 0.03 True
-hf_kernels_swiglu cuda_T128_D768 0.
 hf_kernels_swiglu cuda_T256_D1024 0.03 True
 hf_kernels_swiglu cuda_T256_D2048 0.03 True
 hf_kernels_swiglu cuda_T256_D768 0.03 True
@@ -4319,7 +4319,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4332,7 +4332,7 @@ Installed 37 packages in 208ms
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4481,83 +4481,83 @@ Installed 37 packages in 208ms
-[old y-axis grid, tick, and tick-label SVG markup (ytick_1 through ytick_6); coordinates and tick values truncated by the diff viewer]
@@ -4565,37 +4565,37 @@ Installed 37 packages in 208ms
-[old data paths and point markers for series--hf-kernels-swiglu and series--torch-eager; coordinates truncated by the diff viewer]
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+<dc:date>2025-10-29T04:14:49.758878</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
+[updated y-axis grid and tick markup: labels 0.025, 0.030, 0.035, 0.040, 0.045, 0.050 at y = 454.34229, 378.758958, 303.175626, 227.592294, 152.008962, 76.42563]
 </g>
 </g>
 <g id="series--hf-kernels-swiglu" class="series">
+<path d="M 96.005644 451.16779 L 185.444754 350.778008 L 274.883864 379.197341 L 364.322974 380.406674 L 453.762084 374.828625 L 543.201194 374.360008 L 632.640304 380.406674 L 722.079415 389.174341 L 811.518525 390.081341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
+[updated point markers for hf-kernels-swiglu at the path vertices]
 </g>
 <g id="series--torch-eager" class="series">
+<path d="M 96.005644 226.821344 L 185.444754 49.351681 L 274.883864 47.08418 L 364.322974 50.863347 L 453.762084 60.522897 L 543.201194 66.433513 L 632.640304 64.755563 L 722.079415 82.895563 L 811.518525 77.619847 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
+[updated point markers for torch-eager at the path vertices]
 </g>
 <g id="patch_3">
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: combine | 4.31s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 impl wl p50(ms) ok
 hf_kernels_swiglu cuda_T128_D1024 0.03 True
 hf_kernels_swiglu cuda_T128_D2048 0.03 True
+hf_kernels_swiglu cuda_T128_D768 0.03 True
 hf_kernels_swiglu cuda_T256_D1024 0.03 True
 hf_kernels_swiglu cuda_T256_D2048 0.03 True
 hf_kernels_swiglu cuda_T256_D768 0.03 True
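The combined summary above can be regenerated from the activation.jsonl benchmark artifact; a minimal sketch, assuming each JSONL record carries impl, wl.name, lat_ms.p50, and ok fields, and that the artifact sits at the path linked from the benchmark cell:

```python
import json

# Assumed record layout: {"impl": ..., "wl": {"name": ...}, "lat_ms": {"p50": ...}, "ok": ...}
rows = []
with open("artifacts/benchmark/activation.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        rows.append((rec["impl"], rec["wl"]["name"], rec["lat_ms"]["p50"], rec["ok"]))

# Print one line per (impl, workload), matching the "impl wl p50(ms) ok" layout.
print(f"{'impl':<20} {'wl':<18} {'p50(ms)':>8} ok")
for impl, wl, p50, ok in sorted(rows):
    print(f"{impl:<20} {wl:<18} {p50:>8.2f} {ok}")
```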
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
+Installed 37 packages in 238ms
 </div>
 </div>
 <div class="cell-artifacts">
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+<dc:date>2025-10-29T04:14:49.758878</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
+[updated y-axis grid and tick markup: labels 0.025, 0.030, 0.035, 0.040, 0.045, 0.050 at y = 454.34229, 378.758958, 303.175626, 227.592294, 152.008962, 76.42563]
| 4565 |
</g>
|
| 4566 |
</g>
|
| 4567 |
<g id="series--hf-kernels-swiglu" class="series">
|
| 4568 |
+
<path d="M 96.005644 451.16779 L 185.444754 350.778008 L 274.883864 379.197341 L 364.322974 380.406674 L 453.762084 374.828625 L 543.201194 374.360008 L 632.640304 380.406674 L 722.079415 389.174341 L 811.518525 390.081341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4569 |
<defs>
|
| 4570 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4571 |
</defs>
|
| 4572 |
<g clip-path="url(#p620c7d392f)">
|
| 4573 |
<use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4574 |
+
<use ns4:href="#md7efaf3aec" x="185.444754" y="350.778008" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4575 |
+
<use ns4:href="#md7efaf3aec" x="274.883864" y="379.197341" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4576 |
+
<use ns4:href="#md7efaf3aec" x="364.322974" y="380.406674" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4577 |
+
<use ns4:href="#md7efaf3aec" x="453.762084" y="374.828625" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4578 |
+
<use ns4:href="#md7efaf3aec" x="543.201194" y="374.360008" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4579 |
+
<use ns4:href="#md7efaf3aec" x="632.640304" y="380.406674" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4580 |
+
<use ns4:href="#md7efaf3aec" x="722.079415" y="389.174341" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4581 |
+
<use ns4:href="#md7efaf3aec" x="811.518525" y="390.081341" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4582 |
</g>
|
| 4583 |
</g>
|
| 4584 |
<g id="series--torch-eager" class="series">
|
| 4585 |
+
<path d="M 96.005644 226.821344 L 185.444754 49.351681 L 274.883864 47.08418 L 364.322974 50.863347 L 453.762084 60.522897 L 543.201194 66.433513 L 632.640304 64.755563 L 722.079415 82.895563 L 811.518525 77.619847 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4586 |
<defs>
|
| 4587 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4588 |
</defs>
|
| 4589 |
<g clip-path="url(#p620c7d392f)">
|
| 4590 |
+
<use ns4:href="#m9b8c54d372" x="96.005644" y="226.821344" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4591 |
+
<use ns4:href="#m9b8c54d372" x="185.444754" y="49.351681" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4592 |
+
<use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4593 |
+
<use ns4:href="#m9b8c54d372" x="364.322974" y="50.863347" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4594 |
+
<use ns4:href="#m9b8c54d372" x="453.762084" y="60.522897" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4595 |
+
<use ns4:href="#m9b8c54d372" x="543.201194" y="66.433513" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4596 |
+
<use ns4:href="#m9b8c54d372" x="632.640304" y="64.755563" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4597 |
+
<use ns4:href="#m9b8c54d372" x="722.079415" y="82.895563" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4598 |
+
<use ns4:href="#m9b8c54d372" x="811.518525" y="77.619847" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4599 |
</g>
|
| 4600 |
</g>
|
| 4601 |
<g id="patch_3">
|
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
-{"ts": "2025-10-…   (all 24 previous result lines; their contents are truncated by the diff renderer and are replaced by the 24 new lines below)
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047261000020171196, "p50": 0.04859200004148079, "p90": 0.0489899999820409, "mean": 0.048763200015855546, "iqr": 0.0006179999445521389, "raw_times": [0.050600999998096086, 0.0489899999820409, 0.04837200003748876, 0.04859200004148079, 0.047261000020171196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06049099999927421, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053621000006387476, "p50": 0.05462100000386272, "p90": 0.05485100001578758, "mean": 0.054479000004903355, "iqr": 0.0006300000450210064, "raw_times": [0.05462100000386272, 0.053621000006387476, 0.05485100001578758, 0.055081000027712435, 0.05422099997076657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05994100001771585, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052632000006269664, "p50": 0.054361000024982786, "p90": 0.05462100000386272, "mean": 0.05404320000934604, "iqr": 0.0009999999974752427, "raw_times": [0.054361000024982786, 0.05498100000522754, 0.053621000006387476, 0.05462100000386272, 0.052632000006269664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058602000024166045, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05227200000490484, "p50": 0.05285200001026169, "p90": 0.053781000019625935, "mean": 0.05329160001110722, "iqr": 0.0009899999895424116, "raw_times": [0.05285200001026169, 0.05227200000490484, 0.054761999990660115, 0.053781000019625935, 0.052791000030083524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603200003179154, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05200100002866748, "p50": 0.054180999995878665, "p90": 0.05433100000118429, "mean": 0.05350320001298314, "iqr": 0.0019299999962640868, "raw_times": [0.054602000034265075, 0.05433100000118429, 0.052401000004920206, 0.05200100002866748, 0.054180999995878665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05716200001870675, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05303100004994121, "p50": 0.05431100004216205, "p90": 0.05439099999193786, "mean": 0.053947000014886726, "iqr": 0.0011999999856016075, "raw_times": [0.05439099999193786, 0.05481099998405625, 0.05319100000633625, 0.05431100004216205, 0.05303100004994121], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05832099998315243, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05105100001401297, "p50": 0.05330099997991056, "p90": 0.05380099997864818, "mean": 0.054202999990593526, "iqr": 0.0006199999802447564, "raw_times": [0.05105100001401297, 0.05330099997991056, 0.05318099999840342, 0.0596809999819925, 0.05380099997864818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056061000009322015, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164100002730265, "p50": 0.052870999979859334, "p90": 0.05319100000633625, "mean": 0.05809520000639168, "iqr": 0.0004199999921183917, "raw_times": [0.05277100001421786, 0.052870999979859334, 0.05319100000633625, 0.05164100002730265, 0.08000200000424229], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050990999966415984, "p50": 0.05245100004458436, "p90": 0.05260099999304657, "mean": 0.05224100000305043, "iqr": 0.0006000000212225132, "raw_times": [0.050990999966415984, 0.05316100003938118, 0.05200099997182406, 0.05260099999304657, 0.05245100004458436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05579200001193385, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.0527509999983522, "p90": 0.053382000032797805, "mean": 0.05273720000786852, "iqr": 0.0022010000293448684, "raw_times": [0.05112100001269937, 0.0527509999983522, 0.055250999992040306, 0.053382000032797805, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051831000007496186, "p50": 0.05281099998910577, "p90": 0.0528209999970386, "mean": 0.05249119999461982, "iqr": 0.0007599999776175537, "raw_times": [0.0528209999970386, 0.052061000019421044, 0.051831000007496186, 0.05281099998910577, 0.0529319999600375], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05652100003317173, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05226100000754741, "p50": 0.053051999998388055, "p90": 0.053290999971977726, "mean": 0.05317119998835551, "iqr": 0.00048099997229655855, "raw_times": [0.05226100000754741, 0.054441999964183196, 0.053051999998388055, 0.05280999999968117, 0.053290999971977726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758100002140054, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05227100001548024, "p50": 0.05277099995737444, "p90": 0.05359099998258898, "mean": 0.053112999989934906, "iqr": 0.0010999999631167157, "raw_times": [0.05227100001548024, 0.0544409999747586, 0.05359099998258898, 0.05249100001947227, 0.05277099995737444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05578100001457642, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051241999983631104, "p50": 0.052721000031397125, "p90": 0.05347100000108185, "mean": 0.05272120000654468, "iqr": 0.0009499999578110874, "raw_times": [0.05252100004327076, 0.052721000031397125, 0.05347100000108185, 0.05365099997334255, 0.051241999983631104], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07047099995816097, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512010000193186, "p50": 0.052960999994411395, "p90": 0.05432099999325146, "mean": 0.05357720000347399, "iqr": 0.0024799999778224446, "raw_times": [0.0512010000193186, 0.05756199999495948, 0.05184100001542902, 0.05432099999325146, 0.052960999994411395], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05626099999744838, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05130099998496007, "p50": 0.05301099997723213, "p90": 0.05393200001435616, "mean": 0.05291720000286659, "iqr": 0.0015410000173687877, "raw_times": [0.053951000040797226, 0.05393200001435616, 0.05130099998496007, 0.05301099997723213, 0.052390999996987375], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056620999998813204, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052390999996987375, "p50": 0.052880999987792165, "p90": 0.05298999997194187, "mean": 0.05283679998910884, "iqr": 0.00045899997758169775, "raw_times": [0.05253099999436017, 0.05339099999446262, 0.052390999996987375, 0.052880999987792165, 0.05298999997194187], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057541000046512636, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05287200002612735, "p50": 0.05326199999444725, "p90": 0.05488099998274265, "mean": 0.05554340000344382, "iqr": 0.0019099999803984247, "raw_times": [0.05287200002612735, 0.05326199999444725, 0.052971000002344226, 0.06373100001155763, 0.05488099998274265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05706100000679726, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05237200002738973, "p50": 0.05340200004866347, "p90": 0.054441999964183196, "mean": 0.05358960000876323, "iqr": 0.0017409999486517336, "raw_times": [0.05237200002738973, 0.05503099998804828, 0.054441999964183196, 0.05340200004866347, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056061000009322015, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05089100000077451, "p50": 0.054600999987997056, "p90": 0.054670999986683455, "mean": 0.05481719998670087, "iqr": 0.001720000000204891, "raw_times": [0.05089100000077451, 0.054670999986683455, 0.054600999987997056, 0.06097199997157077, 0.052950999986478564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05628200000273864, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052641000024777895, "p50": 0.05274100004726279, "p90": 0.05537099997354744, "mean": 0.05482900000970403, "iqr": 0.002670000014859397, "raw_times": [0.05274100004726279, 0.052700999958688044, 0.05537099997354744, 0.06069100004424399, 0.052641000024777895], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056130999951164995, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05228100002341307, "p50": 0.05332099999577622, "p90": 0.053600999990521814, "mean": 0.05359900000030393, "iqr": 0.0007799999934832158, "raw_times": [0.05228100002341307, 0.05332099999577622, 0.0528209999970386, 0.055970999994769954, 0.053600999990521814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05611099999214275, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0509510000483715, "p50": 0.05250099997056168, "p90": 0.05270100001553146, "mean": 0.052225199999611505, "iqr": 0.000470000031782547, "raw_times": [0.05270100001553146, 0.0509510000483715, 0.052230999983748916, 0.05250099997056168, 0.05274199997984397], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05471000002899018, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
{"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05066099998884965, "p50": 0.05315100003144835, "p90": 0.05323099998122416, "mean": 0.052780999999413325, "iqr": 0.0009899999895424116, "raw_times": [0.05224099999168175, 0.05462100000386272, 0.05323099998122416, 0.05315100003144835, 0.05066099998884965], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05779099996061632, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff.
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff.
causal_conv1d/results/artifacts/combine/latency.svg CHANGED
causal_conv1d/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<rdf:RDF>
<ns2:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-…
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<ns2:Agent>
@@ -4216,70 +4216,70 @@
[old y-axis block of the embedded SVG (grid paths, tick marks, and tick labels ytick_1 through ytick_5); coordinates and label values are truncated by the diff renderer, and the regenerated values appear in the "+" block further below.]
@@ -4287,66 +4287,66 @@
[old series blocks of the embedded SVG, series--hf-kernels-causal-conv1d and series--torch-eager: line paths, circular marker defs, and 24 data-point markers per series; coordinates are truncated by the diff renderer, and the regenerated values appear in the "+" block further below.]
@@ -4405,7 +4405,7 @@
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
</span> |
-Cell: combine | 4.…
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4516,30 +4516,30 @@ hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
(columns: implementation, workload, p50 latency in ms, correctness ok)
hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
-torch_eager cuda_B2_D2048_S128_W2 0.…
torch_eager cuda_B2_D2048_S128_W4 0.09 True
-torch_eager cuda_B2_D2048_S2048_W2 0.…
torch_eager cuda_B2_D2048_S2048_W4 0.16 True
-torch_eager cuda_B2_D2048_S512_W2 0.…
-torch_eager cuda_B2_D2048_S512_W4 0.…
torch_eager cuda_B2_D64_S128_W2 0.07 True
torch_eager cuda_B2_D64_S128_W4 0.09 True
torch_eager cuda_B2_D64_S2048_W2 0.09 True
-torch_eager cuda_B2_D64_S2048_W4 0.…
torch_eager cuda_B2_D64_S512_W2 0.09 True
torch_eager cuda_B2_D64_S512_W4 0.09 True
torch_eager cuda_B4_D2048_S128_W2 0.09 True
-torch_eager cuda_B4_D2048_S128_W4 0.…
torch_eager cuda_B4_D2048_S2048_W2 0.49 True
torch_eager cuda_B4_D2048_S2048_W4 0.50 True
-torch_eager cuda_B4_D2048_S512_W2 0.…
torch_eager cuda_B4_D2048_S512_W4 0.10 True
-torch_eager cuda_B4_D64_S128_W2 0.…
-torch_eager cuda_B4_D64_S128_W4 0.…
-torch_eager cuda_B4_D64_S2048_W2 0.…
torch_eager cuda_B4_D64_S2048_W4 0.09 True
-torch_eager cuda_B4_D64_S512_W2 0.…
-torch_eager cuda_B4_D64_S512_W4 0.…
("-" rows: old p50 values truncated by the diff renderer)

GENERATING COMBINED VISUALIZATION
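Every record carries both an implementation tag and a workload name, so per-workload comparisons across implementations fall out directly. A hedged sketch (a hypothetical helper, not the repo's actual combine cell; it assumes the per-implementation JSONL files from this commit are passed in together):

    # speedup_sketch.py - illustrative only.
    import json
    from collections import defaultdict

    def p50_ratios(paths):
        p50 = defaultdict(dict)  # workload name -> {impl: p50 latency in ms}
        for path in paths:
            with open(path) as f:
                for line in f:
                    rec = json.loads(line)
                    p50[rec["wl"]["name"]][rec["impl"]] = rec["lat_ms"]["p50"]
        for wl, by_impl in sorted(p50.items()):
            if {"torch_eager", "hf_kernels_causal_conv1d"} <= by_impl.keys():
                ratio = by_impl["torch_eager"] / by_impl["hf_kernels_causal_conv1d"]
                print(f"{wl:24s} torch_eager/hf_kernels = {ratio:4.1f}x")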
@@ -4559,7 +4559,7 @@ Implementations included:
<div class="uv-install-logs" id="uv-logs-combine">
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
<div class="uv-logs-content" style="display: none;">
-Installed 37 packages in …
</div>
</div>
<div class="cell-artifacts">
@@ -4572,7 +4572,7 @@ Installed 37 packages in 239ms
<rdf:RDF>
<ns2:Work>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-…
<dc:format>image/svg+xml</dc:format>
<dc:creator>
<ns2:Agent>
@@ -4916,70 +4916,70 @@ Installed 37 packages in 239ms
[old y-axis block of the second embedded copy of the SVG (grid paths, tick marks, and tick labels ytick_1 through ytick_5); values truncated by the diff renderer.]
@@ -4987,66 +4987,66 @@ Installed 37 packages in 239ms
[old series blocks of the second embedded copy of the SVG (series--hf-kernels-causal-conv1d and series--torch-eager); coordinates truncated by the diff renderer.]
[regenerated "+" side of the embedded SVG: <dc:date> is now 2025-10-29T04:15:07.150955, the y-axis tick labels read 0.1 to 0.5 (latency, ms), and both series were redrawn with the new coordinates. series--hf-kernels-causal-conv1d stays nearly flat across the 24 workloads, while series--torch-eager climbs steeply at its last two points, consistent with the 0.49/0.50 ms B4_D2048_S2048 rows in the summary table above.]
|
| 4327 |
+
<use ns4:href="#m9b8c54d372" x="83.325193" y="400.311295" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4328 |
+
<use ns4:href="#m9b8c54d372" x="114.286231" y="385.754299" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4329 |
+
<use ns4:href="#m9b8c54d372" x="145.247268" y="385.905062" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4330 |
+
<use ns4:href="#m9b8c54d372" x="176.208306" y="386.851518" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4331 |
+
<use ns4:href="#m9b8c54d372" x="207.169343" y="386.441107" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4332 |
+
<use ns4:href="#m9b8c54d372" x="238.130381" y="387.194922" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4333 |
+
<use ns4:href="#m9b8c54d372" x="269.091418" y="387.194922" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4334 |
+
<use ns4:href="#m9b8c54d372" x="300.052455" y="387.161419" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4335 |
+
<use ns4:href="#m9b8c54d372" x="331.013493" y="387.211673" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4336 |
+
<use ns4:href="#m9b8c54d372" x="361.97453" y="387.647211" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4337 |
+
<use ns4:href="#m9b8c54d372" x="392.935568" y="339.58651" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4338 |
+
<use ns4:href="#m9b8c54d372" x="423.896605" y="324.619104" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4339 |
+
<use ns4:href="#m9b8c54d372" x="454.857643" y="388.040869" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4340 |
+
<use ns4:href="#m9b8c54d372" x="485.81868" y="388.342395" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4341 |
+
<use ns4:href="#m9b8c54d372" x="516.779718" y="387.915234" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4342 |
+
<use ns4:href="#m9b8c54d372" x="547.740755" y="387.806349" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4343 |
+
<use ns4:href="#m9b8c54d372" x="578.701793" y="387.086038" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4344 |
+
<use ns4:href="#m9b8c54d372" x="609.66283" y="387.186546" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4345 |
+
<use ns4:href="#m9b8c54d372" x="640.623868" y="387.429442" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4346 |
+
<use ns4:href="#m9b8c54d372" x="671.584905" y="388.216759" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4347 |
+
<use ns4:href="#m9b8c54d372" x="702.545943" y="379.824291" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4348 |
+
<use ns4:href="#m9b8c54d372" x="733.50698" y="375.36841" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4349 |
+
<use ns4:href="#m9b8c54d372" x="764.468018" y="53.322934" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4350 |
<use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4351 |
</g>
|
| 4352 |
</g>
|
|
|
|
Cell: combine | 4.37s
impl                      wl                      p50(ms)  ok
hf_kernels_causal_conv1d  cuda_B4_D64_S2048_W4    0.05     True
hf_kernels_causal_conv1d  cuda_B4_D64_S512_W2     0.05     True
hf_kernels_causal_conv1d  cuda_B4_D64_S512_W4     0.05     True
torch_eager               cuda_B2_D2048_S128_W2   0.09     True
torch_eager               cuda_B2_D2048_S128_W4   0.09     True
torch_eager               cuda_B2_D2048_S2048_W2  0.14     True
torch_eager               cuda_B2_D2048_S2048_W4  0.16     True
torch_eager               cuda_B2_D2048_S512_W2   0.09     True
torch_eager               cuda_B2_D2048_S512_W4   0.09     True
torch_eager               cuda_B2_D64_S128_W2     0.07     True
torch_eager               cuda_B2_D64_S128_W4     0.09     True
torch_eager               cuda_B2_D64_S2048_W2    0.09     True
torch_eager               cuda_B2_D64_S2048_W4    0.09     True
torch_eager               cuda_B2_D64_S512_W2     0.09     True
torch_eager               cuda_B2_D64_S512_W4     0.09     True
torch_eager               cuda_B4_D2048_S128_W2   0.09     True
torch_eager               cuda_B4_D2048_S128_W4   0.09     True
torch_eager               cuda_B4_D2048_S2048_W2  0.49     True
torch_eager               cuda_B4_D2048_S2048_W4  0.50     True
torch_eager               cuda_B4_D2048_S512_W2   0.10     True
torch_eager               cuda_B4_D2048_S512_W4   0.10     True
torch_eager               cuda_B4_D64_S128_W2     0.09     True
torch_eager               cuda_B4_D64_S128_W4     0.09     True
torch_eager               cuda_B4_D64_S2048_W2    0.09     True
torch_eager               cuda_B4_D64_S2048_W4    0.09     True
torch_eager               cuda_B4_D64_S512_W2     0.09     True
torch_eager               cuda_B4_D64_S512_W4     0.09     True
|
| 4543 |
|
| 4544 |
GENERATING COMBINED VISUALIZATION
|
| 4545 |
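The spread above (hf_kernels_causal_conv1d holding ~0.05 ms while torch_eager climbs to 0.49-0.50 ms on the B4_D2048_S2048 workloads) can be recomputed per workload from the benchmark artifact itself. A minimal sketch, assuming the causal_conv1d.jsonl file from this upload is on disk and follows the record schema shown in the other *.jsonl files in this diff:

import json
from collections import defaultdict

# p50[workload][impl] -> median latency in ms
p50 = defaultdict(dict)
with open("causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl") as fh:
    for line in fh:
        rec = json.loads(line)
        p50[rec["wl"]["name"]][rec["impl"]] = rec["lat_ms"]["p50"]

for wl, by_impl in sorted(p50.items()):
    if {"torch_eager", "hf_kernels_causal_conv1d"} <= by_impl.keys():
        ratio = by_impl["torch_eager"] / by_impl["hf_kernels_causal_conv1d"]
        print(f"{wl}: torch_eager is {ratio:.1f}x slower than the hf kernel")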
|
|
|
|
UV Install Logs: Installed 37 packages in 213ms
|
|
|
|
[SVG RDF metadata block; the only change is the timestamp: <dc:date>2025-10-29T04:15:07.150955</dc:date>]
|
|
|
|
[second verbatim copy of the same causal_conv1d latency plot markup (y-axis ticks 0.1-0.5; both series repeat the values above); omitted as a duplicate.]
|
flash_attn/impls/artifacts/benchmark/attention.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1-6 |
-
[six removed records, each truncated to {"ts": "2025-10- in this view; the replacement records follow in full]
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2110259999644768, "p50": 1.2139859999820146, "p90": 1.214847000028385, "mean": 1.2134921999972903, "iqr": 0.002731000051880983, "raw_times": [1.2139859999820146, 1.212115999976504, 1.215486000035071, 1.214847000028385, 1.2110259999644768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2041449999742326, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2489469999650282, "p50": 1.2594169999715632, "p90": 1.2745669999958409, "mean": 1.2628269999936492, "iqr": 0.020920999986628885, "raw_times": [1.2489469999650282, 1.2594169999715632, 1.253646000009212, 1.2745669999958409, 1.2775580000266018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2641869999470146, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2830869999902461, "p50": 1.288437999960479, "p90": 1.2899880000531994, "mean": 1.287595600001623, "iqr": 0.0053310000680539815, "raw_times": [1.2846569999851454, 1.2899880000531994, 1.288437999960479, 1.2918080000190457, 1.2830869999902461], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2736869999798728, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3104980000093747, "p50": 1.3190589999680924, "p90": 1.3191280000341976, "mean": 1.3194864000070083, "iqr": 0.002650000055837154, "raw_times": [1.3104980000093747, 1.3322690000450166, 1.3190589999680924, 1.3164779999783605, 1.3191280000341976], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3112579999869922, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.470441000037681, "p50": 1.4753519999999298, "p90": 1.4777020000451557, "mean": 1.4751576000094246, "iqr": 0.00719000007620707, "raw_times": [1.4777020000451557, 1.4753519999999298, 1.481780999995408, 1.470441000037681, 1.4705119999689487], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.4788519999910932, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4914220000150635, "p50": 1.5084219999721427, "p90": 1.5101930000014363, "mean": 1.5035721999993257, "iqr": 0.01671100000066872, "raw_times": [1.4914220000150635, 1.4934820000007676, 1.5084219999721427, 1.5101930000014363, 1.5143420000072183], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.497162000021035, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "xformers",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -13,18 +12,18 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
import xformers.ops as xops
|
| 17 |
|
| 18 |
|
| 19 |
-
def …   [old function definition, truncated in this view; per the removed import above it wrapped xformers.ops]
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
run_benchmark(
|
| 26 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 27 |
-
impl_name=" … impl_tags={"family": " … impl_func= …   [old argument values, truncated in this view]
|
| 30 |
)
|
|
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
|
|
|
| 7 |
# ]
|
| 8 |
#
|
| 9 |
# [tool.uv.sources]
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
+
def torch_flash(q, k, v):
|
| 18 |
+
qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
|
| 19 |
+
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
|
| 20 |
+
o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
|
| 21 |
+
return o.transpose(1, 2).contiguous()
|
| 22 |
|
| 23 |
|
| 24 |
run_benchmark(
|
| 25 |
kernel_type=KernelTypeEnum.ATTENTION,
|
| 26 |
+
impl_name="torch_flash_ma",
|
| 27 |
+
impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
|
| 28 |
+
impl_func=torch_flash,
|
| 29 |
)
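The new torch_flash wrapper assumes the harness supplies (batch, seq, heads, head_dim) tensors; that is why it transposes into the (batch, heads, seq, head_dim) layout scaled_dot_product_attention expects and transposes back before returning. A quick shape check under that assumption, with sizes taken from the cuda_attn_L128 record above (a standalone sketch, not part of the cell):

import torch

q = k = v = torch.randn(1, 4224, 24, 128, device="cuda", dtype=torch.bfloat16)
out = torch_flash(q, k, v)               # wrapper defined in the cell above
assert out.shape == (1, 4224, 24, 128)   # harness layout restored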
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3871,7 +3871,7 @@ … @@ -4132,38 +4132,38 @@
[old side of the flash_attention.html diff, condensed: the previous cell timing ("Cell: nv | 0.26s"), the prior nvidia-smi status line, and the old PROFILE TRACE tables for torch_flash_ma on cuda_attn_L128/L256/L320/L384/L448/L512_bfloat16. The removed rows are truncated mid-value in this view; the full updated output follows below. Summary rows unchanged by the diff:]
impl wl p50(ms) ok
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
|
|
|
|
Cell: nv | 0.21s
|
|
|
Wed Oct 29 04:14:27 2025
|
| 3892 |
+-----------------------------------------------------------------------------------------+
|
| 3893 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3894 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3897 |
| | | MIG M. |
|
| 3898 |
|=========================================+========================+======================|
|
| 3899 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3900 |
+
| N/A 36C P0 80W / 350W | 0MiB / 46068MiB | 11% Default |
|
| 3901 |
| | | N/A |
|
| 3902 |
+-----------------------------------------+------------------------+----------------------+
|
| 3903 |
|
|
|
|
| 3972 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3973 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3974 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3975 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 102.17% 3.590ms 3.590ms 1
|
| 3976 |
+
torch_flash_ma 6.85% 354.470us 47.44% 2.454ms 2.454ms 0.000us 0.00% 3.554ms 3.554ms 1
|
| 3977 |
+
aten::scaled_dot_product_attention 0.84% 43.371us 4.38% 226.614us 75.538us 0.000us 0.00% 2.798ms 932.564us 3
|
| 3978 |
+
aten::_scaled_dot_product_flash_attention 0.52% 27.141us 3.54% 183.243us 61.081us 0.000us 0.00% 2.798ms 932.564us 3
|
| 3979 |
+
aten::_flash_attention_forward 0.84% 43.539us 2.59% 134.122us 44.707us 2.798ms 79.63% 2.798ms 932.564us 3
|
| 3980 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 79.63% 2.798ms 932.564us 3
|
| 3981 |
+
aten::contiguous 0.29% 14.889us 34.84% 1.803ms 150.217us 0.000us 0.00% 755.939us 62.995us 12
|
| 3982 |
+
aten::clone 0.79% 40.742us 34.56% 1.788ms 148.977us 0.000us 0.00% 755.939us 62.995us 12
|
| 3983 |
+
aten::copy_ 1.80% 93.020us 31.59% 1.634ms 136.197us 715.586us 20.37% 755.939us 62.995us 12
|
| 3984 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.586us 20.37% 715.586us 59.632us 12
|
| 3985 |
+
Activity Buffer Request 27.64% 1.430ms 27.64% 1.430ms 1.430ms 40.353us 1.15% 40.353us 40.353us 1
|
| 3986 |
+
aten::transpose 1.35% 70.048us 1.79% 92.780us 3.866us 0.000us 0.00% 0.000us 0.000us 24
|
| 3987 |
+
aten::as_strided 0.44% 22.732us 0.44% 22.732us 0.947us 0.000us 0.00% 0.000us 0.000us 24
|
| 3988 |
+
aten::empty_like 0.49% 25.480us 2.63% 136.134us 9.076us 0.000us 0.00% 0.000us 0.000us 15
|
| 3989 |
+
aten::empty 2.37% 122.383us 2.37% 122.383us 5.099us 0.000us 0.00% 0.000us 0.000us 24
|
| 3990 |
+
cudaLaunchKernel 2.63% 136.154us 2.63% 136.154us 9.077us 0.000us 0.00% 0.000us 0.000us 15
|
| 3991 |
+
aten::empty_strided 0.35% 17.861us 0.35% 17.861us 5.954us 0.000us 0.00% 0.000us 0.000us 3
|
| 3992 |
+
cudaDeviceGetAttribute 0.05% 2.732us 0.05% 2.732us 0.455us 0.000us 0.00% 0.000us 0.000us 6
|
| 3993 |
+
cudaFuncSetAttribute 0.19% 10.040us 0.19% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3
|
| 3994 |
+
cudaDeviceSynchronize 52.56% 2.719ms 52.56% 2.719ms 2.719ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
+
Self CPU time total: 5.174ms
|
| 3997 |
+
Self CUDA time total: 3.513ms
|
| 3998 |
|
| 3999 |
|
| 4000 |
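Reading the trace above: void pytorch_flash::flash_fwd_kernel carries ~79.6% of CUDA time, and the aten::copy_ / elementwise_kernel rows (~20.4%) are the transpose(...).contiguous() copies torch_flash makes on either side of the attention call. As a hedged sketch (not the benchmarked code), SDPA also accepts the transposed views directly, since only the last dimension must stay contiguous for the flash backend, which would remove those copy kernels:

def torch_flash_nocopy(q, k, v):
    # transpose() returns a strided view; no elementwise copy kernels launch
    qt, kt, vt = (x.transpose(1, 2) for x in (q, k, v))
    with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
        o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
    return o.transpose(1, 2)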
|
|
|
|
| 4004 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4005 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4006 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4007 |
+
torch_flash_ma 5.13% 269.966us 42.38% 2.232ms 2.232ms 0.000us 0.00% 3.778ms 3.778ms 1
|
| 4008 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.734ms 100.30% 3.734ms 3.734ms 1
|
| 4009 |
+
aten::scaled_dot_product_attention 0.51% 26.890us 3.58% 188.304us 62.768us 0.000us 0.00% 2.960ms 986.590us 3
|
| 4010 |
+
aten::_scaled_dot_product_flash_attention 0.35% 18.589us 3.07% 161.414us 53.805us 0.000us 0.00% 2.960ms 986.590us 3
|
| 4011 |
+
aten::_flash_attention_forward 0.78% 41.299us 2.29% 120.413us 40.138us 2.960ms 79.51% 2.960ms 986.590us 3
|
| 4012 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.960ms 79.51% 2.960ms 986.590us 3
|
| 4013 |
+
aten::contiguous 0.18% 9.501us 32.77% 1.726ms 143.802us 0.000us 0.00% 818.206us 68.184us 12
|
| 4014 |
+
aten::clone 0.54% 28.568us 32.59% 1.716ms 143.010us 0.000us 0.00% 818.206us 68.184us 12
|
| 4015 |
+
aten::copy_ 1.52% 80.181us 30.79% 1.621ms 135.119us 762.846us 20.49% 818.206us 68.184us 12
|
| 4016 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.846us 20.49% 762.846us 63.571us 12
|
| 4017 |
+
Activity Buffer Request 27.52% 1.449ms 27.52% 1.449ms 1.449ms 55.360us 1.49% 55.360us 55.360us 1
|
| 4018 |
+
aten::transpose 1.00% 52.915us 1.33% 70.084us 2.920us 0.000us 0.00% 0.000us 0.000us 24
|
| 4019 |
+
aten::as_strided 0.33% 17.169us 0.33% 17.169us 0.715us 0.000us 0.00% 0.000us 0.000us 24
|
| 4020 |
+
aten::empty_like 0.39% 20.652us 1.64% 86.425us 5.762us 0.000us 0.00% 0.000us 0.000us 15
|
| 4021 |
+
aten::empty 1.51% 79.433us 1.51% 79.433us 3.310us 0.000us 0.00% 0.000us 0.000us 24
|
| 4022 |
+
cudaLaunchKernel 2.18% 114.743us 2.18% 114.743us 7.650us 0.000us 0.00% 0.000us 0.000us 15
|
| 4023 |
+
aten::empty_strided 0.29% 15.331us 0.29% 15.331us 5.110us 0.000us 0.00% 0.000us 0.000us 3
|
| 4024 |
cudaDeviceGetAttribute 0.04% 1.900us 0.04% 1.900us 0.317us 0.000us 0.00% 0.000us 0.000us 6
|
| 4025 |
+
cudaFuncSetAttribute 0.10% 5.520us 0.10% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
|
| 4026 |
+
cudaDeviceSynchronize 57.62% 3.034ms 57.62% 3.034ms 3.034ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4027 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4028 |
+
Self CPU time total: 5.265ms
|
| 4029 |
+
Self CUDA time total: 3.723ms
|
| 4030 |
|
| 4031 |
|
| 4032 |
|
|
|
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4038 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4039 |
+
torch_flash_ma 5.04% 266.137us 41.64% 2.197ms 2.197ms 0.000us 0.00% 3.820ms 3.820ms 1
|
| 4040 |
+
torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.772ms 100.29% 3.772ms 3.772ms 1
|
| 4041 |
+
aten::scaled_dot_product_attention 0.49% 25.880us 3.59% 189.194us 63.065us 0.000us 0.00% 2.983ms 994.205us 3
|
| 4042 |
+
aten::_scaled_dot_product_flash_attention 0.37% 19.363us 3.10% 163.314us 54.438us 0.000us 0.00% 2.983ms 994.205us 3
|
| 4043 |
+
aten::_flash_attention_forward 0.81% 42.782us 2.31% 121.862us 40.621us 2.983ms 79.31% 2.983ms 994.205us 3
|
| 4044 |
+
void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.983ms 79.31% 2.983ms 994.205us 3
|
| 4045 |
+
aten::contiguous 0.18% 9.290us 32.12% 1.695ms 141.255us 0.000us 0.00% 836.990us 69.749us 12
|
| 4046 |
+
aten::clone 0.53% 27.791us 31.95% 1.686ms 140.481us 0.000us 0.00% 836.990us 69.749us 12
|
| 4047 |
+
aten::copy_ 1.57% 82.879us 30.22% 1.595ms 132.896us 778.238us 20.69% 836.990us 69.749us 12
|
| 4048 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 778.238us 20.69% 778.238us 64.853us 12
|
| 4049 |
+
Activity Buffer Request 26.92% 1.420ms 26.92% 1.420ms 1.420ms 58.752us 1.56% 58.752us 58.752us 1
|
| 4050 |
+
aten::transpose 0.98% 51.581us 1.30% 68.820us 2.868us 0.000us 0.00% 0.000us 0.000us 24
|
| 4051 |
+
aten::as_strided 0.33% 17.239us 0.33% 17.239us 0.718us 0.000us 0.00% 0.000us 0.000us 24
|
| 4052 |
+
aten::empty_like 0.35% 18.669us 1.58% 83.581us 5.572us 0.000us 0.00% 0.000us 0.000us 15
|
| 4053 |
+
aten::empty 1.49% 78.372us 1.49% 78.372us 3.265us 0.000us 0.00% 0.000us 0.000us 24
|
| 4054 |
+
cudaLaunchKernel 2.17% 114.523us 2.17% 114.523us 7.635us 0.000us 0.00% 0.000us 0.000us 15
|
| 4055 |
+
aten::empty_strided 0.29% 15.511us 0.29% 15.511us 5.170us 0.000us 0.00% 0.000us 0.000us 3
|
| 4056 |
+
cudaDeviceGetAttribute 0.04% 2.300us 0.04% 2.300us 0.383us 0.000us 0.00% 0.000us 0.000us 6
|
| 4057 |
+
cudaFuncSetAttribute 0.09% 4.560us 0.09% 4.560us 1.520us 0.000us 0.00% 0.000us 0.000us 3
|
| 4058 |
+
cudaDeviceSynchronize 58.36% 3.079ms 58.36% 3.079ms 3.079ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4059 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4060 |
+
Self CPU time total: 5.277ms
|
| 4061 |
+
Self CUDA time total: 3.761ms
|
| 4062 |
|
| 4063 |
|
| 4064 |
|
|
|
|
| 4068 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 4.81% 269.664us 43.38% 2.432ms 2.432ms 0.000us 0.00% 3.921ms 3.921ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.875ms 100.29% 3.875ms 3.875ms 1
+ aten::scaled_dot_product_attention 0.47% 26.530us 3.32% 186.254us 62.085us 0.000us 0.00% 3.079ms 1.026ms 3
+ aten::_scaled_dot_product_flash_attention 0.33% 18.670us 2.85% 159.724us 53.241us 0.000us 0.00% 3.079ms 1.026ms 3
+ aten::_flash_attention_forward 0.73% 41.012us 2.12% 118.963us 39.654us 3.079ms 79.68% 3.079ms 1.026ms 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.079ms 79.68% 3.079ms 1.026ms 3
+ aten::contiguous 0.17% 9.411us 34.39% 1.928ms 160.703us 0.000us 0.00% 842.199us 70.183us 12
+ aten::clone 0.52% 28.883us 34.22% 1.919ms 159.919us 0.000us 0.00% 842.199us 70.183us 12
+ aten::copy_ 1.48% 82.822us 32.55% 1.825ms 152.123us 784.952us 20.32% 842.199us 70.183us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.952us 20.32% 784.952us 65.413us 12
+ Activity Buffer Request 25.77% 1.445ms 25.77% 1.445ms 1.445ms 57.247us 1.48% 57.247us 57.247us 1
+ aten::transpose 0.94% 52.967us 1.25% 70.184us 2.924us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.31% 17.217us 0.31% 17.217us 0.717us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.34% 19.178us 1.51% 84.829us 5.655us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.41% 78.973us 1.41% 78.973us 3.291us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 5.72% 320.465us 5.72% 320.465us 21.364us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 15.229us 0.27% 15.229us 5.076us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 4.130us 0.07% 4.130us 1.377us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 56.62% 3.175ms 56.62% 3.175ms 3.175ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.607ms
+ Self CUDA time total: 3.864ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 5.31% 318.398us 40.52% 2.428ms 2.428ms 0.000us 0.00% 4.370ms 4.370ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.321ms 100.25% 4.321ms 4.321ms 1
+ aten::scaled_dot_product_attention 0.43% 25.890us 3.27% 195.733us 65.244us 0.000us 0.00% 3.503ms 1.168ms 3
+ aten::_scaled_dot_product_flash_attention 0.32% 19.430us 2.83% 169.843us 56.614us 0.000us 0.00% 3.503ms 1.168ms 3
+ aten::_flash_attention_forward 0.75% 44.733us 2.13% 127.534us 42.511us 3.503ms 81.28% 3.503ms 1.168ms 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.503ms 81.28% 3.503ms 1.168ms 3
+ aten::contiguous 0.16% 9.533us 31.15% 1.866ms 155.517us 0.000us 0.00% 867.131us 72.261us 12
+ aten::clone 0.48% 28.649us 30.99% 1.857ms 154.722us 0.000us 0.00% 867.131us 72.261us 12
+ aten::copy_ 1.37% 82.103us 29.43% 1.763ms 146.944us 806.940us 18.72% 867.131us 72.261us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.940us 18.72% 806.940us 67.245us 12
+ Activity Buffer Request 23.90% 1.432ms 23.90% 1.432ms 1.432ms 60.191us 1.40% 60.191us 60.191us 1
+ aten::transpose 0.87% 52.328us 1.17% 70.130us 2.922us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.30% 17.802us 0.30% 17.802us 0.742us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.33% 20.052us 1.44% 86.062us 5.737us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.32% 79.270us 1.32% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.58% 274.314us 4.58% 274.314us 18.288us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 16.430us 0.27% 16.430us 5.477us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.360us 0.04% 2.360us 0.393us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 4.210us 0.07% 4.210us 1.403us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.48% 3.564ms 59.48% 3.564ms 3.564ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.991ms
+ Self CUDA time total: 4.310ms

------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 3.92% 237.516us 38.06% 2.305ms 2.305ms 0.000us 0.00% 4.487ms 4.487ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.438ms 100.25% 4.438ms 4.438ms 1
+ aten::scaled_dot_product_attention 0.44% 26.369us 3.02% 182.943us 60.981us 0.000us 0.00% 3.605ms 1.202ms 3
+ aten::_scaled_dot_product_flash_attention 0.31% 18.541us 2.59% 156.574us 52.191us 0.000us 0.00% 3.605ms 1.202ms 3
+ aten::_flash_attention_forward 0.63% 38.112us 1.91% 115.882us 38.627us 3.605ms 81.43% 3.605ms 1.202ms 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.605ms 81.43% 3.605ms 1.202ms 3
+ aten::contiguous 0.15% 9.281us 30.31% 1.836ms 153.003us 0.000us 0.00% 882.684us 73.557us 12
+ aten::clone 0.47% 28.328us 30.16% 1.827ms 152.229us 0.000us 0.00% 882.684us 73.557us 12
+ aten::copy_ 1.32% 79.871us 28.64% 1.734ms 144.531us 822.268us 18.57% 882.684us 73.557us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 822.268us 18.57% 822.268us 68.522us 12
+ Activity Buffer Request 23.38% 1.416ms 23.38% 1.416ms 1.416ms 60.416us 1.36% 60.416us 60.416us 1
+ aten::transpose 0.89% 53.992us 1.17% 70.941us 2.956us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.28% 16.949us 0.28% 16.949us 0.706us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.33% 19.985us 1.39% 84.474us 5.632us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.27% 76.679us 1.27% 76.679us 3.195us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.33% 262.156us 4.33% 262.156us 17.477us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.26% 15.620us 0.26% 15.620us 5.207us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.329us 0.04% 2.329us 0.388us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.781us 0.06% 3.781us 1.260us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.94% 3.751ms 61.94% 3.751ms 3.751ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 6.057ms
+ Self CUDA time total: 4.427ms

impl wl p50(ms) ok
torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
+ torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
+ torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
+ torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
</pre></div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
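The torch_flash_ma traces above all funnel through aten::scaled_dot_product_attention into aten::_flash_attention_forward, while the aten::contiguous / aten::clone / aten::copy_ rows (12 calls, roughly 19-20% of self CUDA time) suggest the harness makes the q/k/v layouts contiguous before each call. A minimal sketch of reproducing such a trace, assuming illustrative bfloat16 shapes (this is not the benchmark's exact code):

import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

# Illustrative shapes; the workloads above only vary sequence length (L128..L512).
q, k, v = (torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
           for _ in range(3))

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
) as prof:
    # Pin SDPA to the flash backend, mirroring the
    # aten::_scaled_dot_product_flash_attention rows in the trace.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION]):
        out = F.scaled_dot_product_attention(q, k, v)
    torch.cuda.synchronize()  # accounts for the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))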
flash_attn/impls/hf_kernels_flash_attn.html CHANGED

@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
- Cell: benchmark |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 3.
- _flash_attn_9e27194::fwd 1.
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- Activity Buffer Request 32.85% 1.
- cudaDeviceGetAttribute 0.11% 4.
- aten::empty_like 0.
- aten::empty_strided 0.
- aten::empty 0.
- cudaFuncSetAttribute 0.
- cudaLaunchKernel 0.
- cudaDeviceSynchronize 58.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total: 2.

@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn
- _flash_attn_9e27194::fwd 1.
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- Activity Buffer Request 31.
- cudaDeviceGetAttribute 0.
- aten::empty_like 0.
- aten::empty_strided 0.
- aten::empty 0.
- cudaFuncSetAttribute 0.
- cudaLaunchKernel 0.
- cudaDeviceSynchronize 63.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total: 2.

@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.
- _flash_attn_9e27194::fwd 1.
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us
- Activity Buffer Request 31.
- cudaDeviceGetAttribute 0.
- aten::empty_like 0.16% 7.
- aten::empty_strided 0.
- aten::empty 0.
- cudaFuncSetAttribute 0.
- cudaLaunchKernel 0.
- cudaDeviceSynchronize 63.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total:

@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.
- _flash_attn_9e27194::fwd 1.
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- Activity Buffer Request 29.
- cudaDeviceGetAttribute 0.08% 3.
- aten::empty_like 0.16% 7.
- aten::empty_strided 0.
- aten::empty 0.
- cudaFuncSetAttribute 0.08%
- cudaLaunchKernel 4.
- cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total: 3.

@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.
- _flash_attn_9e27194::fwd 0.
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- Activity Buffer Request 27.
- cudaDeviceGetAttribute 0.07% 3.
- aten::empty_like 0.
- aten::empty_strided 0.
- aten::empty 0.
- cudaFuncSetAttribute 0.
- cudaLaunchKernel 3.
- cudaDeviceSynchronize 65.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.
- Self CUDA time total: 3.

@@ -4046,35 +4046,35 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.
- _flash_attn_9e27194::fwd 0.
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- Activity Buffer Request 26.
- cudaDeviceGetAttribute 0.
- aten::empty_like 0.
- aten::empty_strided 0.
- aten::empty 0.
- cudaFuncSetAttribute 0.07% 3.
- cudaLaunchKernel 3.
- cudaDeviceSynchronize 66.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.
- Self CUDA time total: 3.

impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
- hf_kernels_flash_attn cuda_attn_L256_bfloat16
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
</pre></div>
<div class="cell-stderr">
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
- Fetching 20 files: 10%|█ | 2/20 [00:01<00:
- Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00,
</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
+ Cell: benchmark | 6.00s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 3.72% 162.003us 41.36% 1.801ms 1.801ms 0.000us 0.00% 3.718ms 3.718ms 1
+ _flash_attn_9e27194::fwd 1.69% 73.411us 37.64% 1.639ms 546.409us 2.775ms 100.00% 3.718ms 1.239ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.777ms 100.05% 2.777ms 2.777ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.775ms 100.00% 2.775ms 925.087us 3
+ Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 943.102us 33.98% 943.102us 943.102us 1
+ cudaDeviceGetAttribute 0.11% 4.701us 0.11% 4.701us 0.313us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.40% 17.630us 1.19% 51.921us 17.307us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.79% 34.291us 0.79% 34.291us 11.430us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.58% 25.250us 0.58% 25.250us 2.806us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.29% 12.441us 0.29% 12.441us 4.147us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.94% 40.982us 0.94% 40.982us 13.661us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.64% 2.554ms 58.64% 2.554ms 2.554ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.355ms
+ Self CUDA time total: 2.775ms

PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.15% 96.184us 36.83% 1.645ms 1.645ms 0.000us 0.00% 3.965ms 3.965ms 1
+ _flash_attn_9e27194::fwd 1.07% 47.845us 34.67% 1.549ms 516.264us 2.974ms 100.00% 3.965ms 1.322ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.975ms 100.05% 2.975ms 2.975ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.974ms 100.00% 2.974ms 991.313us 3
+ Activity Buffer Request 31.80% 1.421ms 31.80% 1.421ms 1.421ms 990.779us 33.32% 990.779us 990.779us 1
+ cudaDeviceGetAttribute 0.08% 3.723us 0.08% 3.723us 0.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 6.890us 0.53% 23.451us 7.817us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.37% 16.561us 0.37% 16.561us 5.520us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.50% 22.171us 0.50% 22.171us 2.463us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.09% 3.911us 0.09% 3.911us 1.304us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.61% 27.040us 0.61% 27.040us 9.013us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.17% 2.822ms 63.17% 2.822ms 2.822ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.467ms
+ Self CUDA time total: 2.974ms

PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.41% 109.001us 36.55% 1.652ms 1.652ms 0.000us 0.00% 4.036ms 4.036ms 1
+ _flash_attn_9e27194::fwd 1.11% 50.180us 34.14% 1.543ms 514.365us 3.018ms 100.00% 4.036ms 1.345ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.019ms 100.05% 3.019ms 3.019ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.018ms 100.00% 3.018ms 1.006ms 3
+ Activity Buffer Request 31.22% 1.411ms 31.22% 1.411ms 1.411ms 1.018ms 33.73% 1.018ms 1.018ms 1
+ cudaDeviceGetAttribute 0.08% 3.790us 0.08% 3.790us 0.253us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.151us 0.52% 23.401us 7.800us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.48% 21.660us 0.48% 21.660us 2.407us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.10% 4.380us 0.10% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.64% 28.812us 0.64% 28.812us 9.604us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.45% 2.868ms 63.45% 2.868ms 2.868ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.520ms
+ Self CUDA time total: 3.018ms

PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.47% 118.264us 38.70% 1.854ms 1.854ms 0.000us 0.00% 4.130ms 4.130ms 1
+ _flash_attn_9e27194::fwd 1.01% 48.470us 36.23% 1.735ms 578.465us 3.094ms 100.00% 4.130ms 1.377ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.05% 3.096ms 3.096ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3
+ Activity Buffer Request 29.33% 1.405ms 29.33% 1.405ms 1.405ms 1.036ms 33.49% 1.036ms 1.036ms 1
+ cudaDeviceGetAttribute 0.08% 3.720us 0.08% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.520us 0.53% 25.440us 8.480us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.43% 20.670us 0.43% 20.670us 2.297us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 4.010us 0.08% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.76% 227.935us 4.76% 227.935us 75.978us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.30% 2.937ms 61.30% 2.937ms 2.937ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.790ms
+ Self CUDA time total: 3.094ms

PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.07% 110.462us 34.39% 1.835ms 1.835ms 0.000us 0.00% 4.876ms 4.876ms 1
+ _flash_attn_9e27194::fwd 0.91% 48.552us 32.32% 1.724ms 574.769us 3.652ms 100.00% 4.876ms 1.625ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.05% 3.654ms 3.654ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
+ Activity Buffer Request 27.00% 1.440ms 27.00% 1.440ms 1.440ms 1.224ms 33.53% 1.224ms 1.224ms 1
+ cudaDeviceGetAttribute 0.07% 3.831us 0.07% 3.831us 0.255us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 7.880us 0.47% 24.970us 8.323us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.32% 17.090us 0.32% 17.090us 5.697us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.44% 23.410us 0.44% 23.410us 2.601us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 4.110us 0.08% 4.110us 1.370us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.36% 179.284us 3.36% 179.284us 59.761us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 65.61% 3.500ms 65.61% 3.500ms 3.500ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.335ms
+ Self CUDA time total: 3.652ms

PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.06% 108.982us 33.74% 1.784ms 1.784ms 0.000us 0.00% 4.883ms 4.883ms 1
+ _flash_attn_9e27194::fwd 0.92% 48.842us 31.68% 1.675ms 558.369us 3.652ms 100.00% 4.883ms 1.628ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.04% 3.654ms 3.654ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
+ Activity Buffer Request 26.57% 1.405ms 26.57% 1.405ms 1.405ms 1.231ms 33.70% 1.231ms 1.231ms 1
+ cudaDeviceGetAttribute 0.07% 3.720us 0.07% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.14% 7.460us 0.45% 23.940us 7.980us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.31% 16.480us 0.31% 16.480us 5.493us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.43% 22.601us 0.43% 22.601us 2.511us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.610us 0.07% 3.610us 1.203us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.17% 167.603us 3.17% 167.603us 55.868us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 66.26% 3.504ms 66.26% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.288ms
+ Self CUDA time total: 3.652ms

impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
</pre></div>
<div class="cell-stderr">
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+ Fetching 20 files: 10%|█ | 2/20 [00:01<00:14, 1.21it/s]
+ Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.08it/s]
</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
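Compared with torch_flash_ma, the hf_kernels_flash_attn traces carry no aten::contiguous / aten::copy_ rows: the custom _flash_attn_9e27194::fwd op consumes q/k/v directly, and the "Fetching 20 files" stderr shows a prebuilt kernel being pulled from the Hub on first use. A hedged sketch of that loading path via the kernels library; the repo id and the wrapper's call signature are assumptions, not taken from this benchmark's source:

import torch
from kernels import get_kernel

# Downloads and caches the prebuilt binary on first use (the
# "Fetching 20 files" progress bars in stderr above).
flash_attn = get_kernel("kernels-community/flash-attn")  # repo id assumed

q = torch.randn(1, 512, 8, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

# The profiler names the registered custom op "_flash_attn_9e27194::fwd";
# the exact Python wrapper and its argument list are not shown on this page:
# out = flash_attn.fwd(q, k, v, ...)  # signature assumed, verify against the repo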
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED

@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark"
</span> |
- Cell: benchmark |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 3.
- FlashAttnFunc 3.
- _flash_attn3_48fe103_dirty::fwd 1.86% 80.
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- Activity Buffer Request
- aten::empty 1.
- cudaFuncSetAttribute 0.30%
- cudaLaunchKernel 1.
- cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total: 2.

@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.
- FlashAttnFunc 2.
- _flash_attn3_48fe103_dirty::fwd 1.
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- Activity Buffer Request 32.
- aten::empty 0.
- cudaFuncSetAttribute 0.
- cudaLaunchKernel 0.
- cudaDeviceSynchronize 59.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total: 2.

@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.
- FlashAttnFunc 2.
- _flash_attn3_48fe103_dirty::fwd 1.
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
- Activity Buffer Request
- aten::empty 0.
- cudaFuncSetAttribute 0.12% 5.
- cudaLaunchKernel 0.
- cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total: 2.

@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.
- FlashAttnFunc
- _flash_attn3_48fe103_dirty::fwd 1.
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us
- Activity Buffer Request
- aten::empty 0.
- cudaFuncSetAttribute 0.11% 5.
- cudaLaunchKernel 5.
- cudaDeviceSynchronize
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.
- Self CUDA time total:

@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.
- FlashAttnFunc 1.
- _flash_attn3_48fe103_dirty::fwd
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- Activity Buffer Request 27.
- aten::empty 0.
- cudaFuncSetAttribute 0.10% 5.
- cudaLaunchKernel 3.
- cudaDeviceSynchronize 62.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.
- Self CUDA time total: 3.

@@ -4035,40 +4035,35 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.
- FlashAttnFunc 1.
- _flash_attn3_48fe103_dirty::fwd 1.
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.
- Activity Buffer Request 27.
- aten::empty 0.
- cudaFuncSetAttribute 0.10% 5.
- cudaLaunchKernel 3.
- cudaDeviceSynchronize 63.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.
- Self CUDA time total: 3.

impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
</pre></div>
- <div class="
-
-
-
-
- Installed 15 packages in 15ms
</div>
- </div>
- <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
- Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.06it/s]
- Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.12it/s]</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
<span class="collapse-indicators">
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
</span> |
+ Cell: benchmark | 5.51s
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 3.90% 167.264us 45.75% 1.964ms 1.964ms 0.000us 0.00% 3.551ms 3.551ms 1
+ FlashAttnFunc 3.34% 143.492us 41.85% 1.797ms 598.836us 0.000us 0.00% 3.551ms 1.184ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.86% 80.044us 38.51% 1.653ms 551.005us 2.654ms 100.00% 3.551ms 1.184ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.655ms 100.05% 2.655ms 2.655ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.654ms 100.00% 2.654ms 884.532us 3
+ Activity Buffer Request 34.19% 1.468ms 34.19% 1.468ms 1.468ms 897.822us 33.83% 897.822us 897.822us 1
+ aten::empty 1.09% 46.590us 1.09% 46.590us 7.765us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.30% 12.680us 0.30% 12.680us 4.227us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.07% 45.911us 1.07% 45.911us 15.304us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 54.25% 2.329ms 54.25% 2.329ms 2.329ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.293ms
+ Self CUDA time total: 2.654ms

PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.52% 108.973us 40.29% 1.745ms 1.745ms 0.000us 0.00% 3.761ms 3.761ms 1
+ FlashAttnFunc 2.11% 91.250us 37.77% 1.636ms 545.408us 0.000us 0.00% 3.761ms 1.254ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.23% 53.414us 35.67% 1.545ms 514.991us 2.811ms 100.00% 3.761ms 1.254ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.813ms 100.05% 2.813ms 2.813ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.811ms 100.00% 2.811ms 937.084us 3
+ Activity Buffer Request 32.99% 1.429ms 32.99% 1.429ms 1.429ms 949.852us 33.79% 949.852us 949.852us 1
+ aten::empty 0.64% 27.630us 0.64% 27.630us 4.605us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 4.980us 0.11% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.69% 30.020us 0.69% 30.020us 10.007us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.71% 2.587ms 59.71% 2.587ms 2.587ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.332ms
+ Self CUDA time total: 2.811ms

PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.50% 112.343us 38.92% 1.748ms 1.748ms 0.000us 0.00% 3.960ms 3.960ms 1
+ FlashAttnFunc 2.05% 91.871us 36.42% 1.636ms 545.325us 0.000us 0.00% 3.960ms 1.320ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.14% 51.221us 34.37% 1.544ms 514.701us 2.972ms 100.00% 3.960ms 1.320ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.973ms 100.05% 2.973ms 2.973ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.972ms 100.00% 2.972ms 990.630us 3
+ Activity Buffer Request 31.81% 1.429ms 31.81% 1.429ms 1.429ms 987.835us 33.24% 987.835us 987.835us 1
+ aten::empty 0.63% 28.400us 0.63% 28.400us 4.733us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.211us 0.12% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.67% 30.301us 0.67% 30.301us 10.100us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.08% 2.744ms 61.08% 2.744ms 2.744ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.492ms
+ Self CUDA time total: 2.972ms

PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.16% 102.333us 40.97% 1.945ms 1.945ms 0.000us 0.00% 4.045ms 4.045ms 1
+ FlashAttnFunc 1.95% 92.400us 38.81% 1.843ms 614.206us 0.000us 0.00% 4.045ms 1.348ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.07% 50.872us 36.87% 1.750ms 583.406us 3.024ms 100.00% 4.045ms 1.348ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.026ms 100.05% 3.026ms 3.026ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.024ms 100.00% 3.024ms 1.008ms 3
+ Activity Buffer Request 29.88% 1.419ms 29.88% 1.419ms 1.419ms 1.021ms 33.76% 1.021ms 1.021ms 1
+ aten::empty 0.61% 28.961us 0.61% 28.961us 4.827us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.19% 246.415us 5.19% 246.415us 82.138us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.03% 2.803ms 59.03% 2.803ms 2.803ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.747ms
+ Self CUDA time total: 3.024ms

PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.48% 128.541us 37.35% 1.936ms 1.936ms 0.000us 0.00% 4.636ms 4.636ms 1
+ FlashAttnFunc 1.81% 93.984us 34.87% 1.807ms 602.493us 0.000us 0.00% 4.636ms 1.545ms 3
+ _flash_attn3_48fe103_dirty::fwd 0.96% 49.852us 33.05% 1.713ms 571.165us 3.473ms 100.00% 4.636ms 1.545ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.475ms 100.05% 3.475ms 3.475ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.473ms 100.00% 3.473ms 1.158ms 3
+ Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.49% 1.163ms 1.163ms 1
+ aten::empty 0.57% 29.640us 0.57% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.160us 0.10% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.62% 187.873us 3.62% 187.873us 62.624us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 62.65% 3.248ms 62.65% 3.248ms 3.248ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.184ms
+ Self CUDA time total: 3.473ms

PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.41% 121.192us 36.39% 1.829ms 1.829ms 0.000us 0.00% 4.566ms 4.566ms 1
+ FlashAttnFunc 1.84% 92.271us 33.97% 1.707ms 569.139us 0.000us 0.00% 4.566ms 1.522ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.00% 50.242us 32.14% 1.615ms 538.382us 3.416ms 100.00% 4.566ms 1.522ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.417ms 100.04% 3.417ms 3.417ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.416ms 100.00% 3.416ms 1.139ms 3
+ Activity Buffer Request 27.08% 1.361ms 27.08% 1.361ms 1.361ms 1.150ms 33.68% 1.150ms 1.150ms 1
+ aten::empty 0.60% 30.030us 0.60% 30.030us 5.005us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.10% 5.061us 0.10% 5.061us 1.687us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.36% 168.913us 3.36% 168.913us 56.304us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.61% 3.197ms 63.61% 3.197ms 3.197ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.026ms
+ Self CUDA time total: 3.416ms

impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
</pre></div>
+ <div class="cell-stderr">
+ Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 9.18it/s]
+ Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.23it/s]
+ Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.82it/s]
</div>
<div class="cell-artifacts">
<h4>Artifacts:</h4>
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
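Both attention implementations above append their measurements to the same attention.jsonl artifact, so cross-implementation comparison can be scripted directly. A small sketch, assuming each JSON row carries impl, wl.name, and lat_ms.p50 fields as the "impl wl p50(ms) ok" summary tables suggest:

import json
from collections import defaultdict

p50 = defaultdict(dict)  # workload name -> {impl: p50 latency in ms}
with open("artifacts/benchmark/attention.jsonl") as f:
    for line in f:
        row = json.loads(line)
        p50[row["wl"]["name"]][row["impl"]] = row["lat_ms"]["p50"]

for wl, impls in sorted(p50.items()):
    best = min(impls, key=impls.get)  # lowest p50 wins
    print(f"{wl}: fastest={best} ({impls[best]:.2f} ms)")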
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3869,9 +3869,9 @@
@@ -3924,28 +3924,28 @@
@@ -3955,28 +3955,28 @@
@@ -3986,28 +3986,28 @@
@@ -4017,28 +4017,28 @@
@@ -4048,28 +4048,28 @@
@@ -4079,90 +4079,38 @@
[The old side of this side-by-side diff was truncated during extraction; only the leading characters of each removed line survive. It carried the previous cell timing, the previous per-workload profile traces and p50 rows, and a UV install log ("Installed 37 packages in 223ms") that the new revision drops. The updated output follows in full.]
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 3.92s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3924 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3925 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3926 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3927 |
+
torch_mem_eff 4.88% 347.876us 33.28% 2.372ms 2.372ms 0.000us 0.00% 5.473ms 5.473ms 1
|
| 3928 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.465ms 100.77% 5.465ms 5.465ms 1
|
| 3929 |
+
aten::scaled_dot_product_attention 0.44% 31.501us 2.47% 176.074us 58.691us 0.000us 0.00% 4.806ms 1.602ms 3
|
| 3930 |
+
aten::_scaled_dot_product_efficient_attention 0.33% 23.351us 2.03% 144.573us 48.191us 0.000us 0.00% 4.806ms 1.602ms 3
|
| 3931 |
+
aten::_efficient_attention_forward 0.48% 33.995us 1.40% 99.622us 33.207us 4.806ms 88.63% 4.806ms 1.602ms 3
|
| 3932 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.806ms 88.63% 4.806ms 1.602ms 3
|
| 3933 |
+
aten::contiguous 0.20% 13.962us 24.98% 1.780ms 197.762us 0.000us 0.00% 667.264us 74.140us 9
|
| 3934 |
+
aten::clone 0.48% 34.432us 24.78% 1.766ms 196.211us 0.000us 0.00% 667.264us 74.140us 9
|
| 3935 |
+
aten::copy_ 1.03% 73.682us 23.27% 1.658ms 184.268us 616.768us 11.37% 667.264us 74.140us 9
|
| 3936 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.768us 11.37% 616.768us 68.530us 9
|
| 3937 |
+
Activity Buffer Request 21.06% 1.501ms 21.06% 1.501ms 1.501ms 50.496us 0.93% 50.496us 50.496us 1
|
| 3938 |
+
aten::transpose 0.94% 67.099us 1.26% 89.541us 3.731us 0.000us 0.00% 0.000us 0.000us 24
|
| 3939 |
+
aten::as_strided 0.31% 22.442us 0.31% 22.442us 0.935us 0.000us 0.00% 0.000us 0.000us 24
|
| 3940 |
+
aten::empty_like 0.26% 18.431us 1.03% 73.051us 8.117us 0.000us 0.00% 0.000us 0.000us 9
|
| 3941 |
+
aten::empty 1.15% 82.238us 1.15% 82.238us 3.916us 0.000us 0.00% 0.000us 0.000us 21
|
| 3942 |
+
cudaLaunchKernel 1.53% 109.170us 1.53% 109.170us 9.098us 0.000us 0.00% 0.000us 0.000us 12
|
| 3943 |
+
cudaStreamIsCapturing 0.04% 3.169us 0.04% 3.169us 1.056us 0.000us 0.00% 0.000us 0.000us 3
|
| 3944 |
+
cudaFuncSetAttribute 0.13% 9.530us 0.13% 9.530us 3.177us 0.000us 0.00% 0.000us 0.000us 3
|
| 3945 |
+
cudaDeviceSynchronize 66.72% 4.754ms 66.72% 4.754ms 4.754ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3946 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3947 |
+
Self CPU time total: 7.126ms
|
| 3948 |
+
Self CUDA time total: 5.423ms
|
| 3949 |
|
| 3950 |
|
| 3951 |
|
|
|
|
| 3955 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3956 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3957 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3958 |
+
torch_mem_eff 3.49% 251.026us 29.53% 2.123ms 2.123ms 0.000us 0.00% 5.671ms 5.671ms 1
|
| 3959 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.625ms 100.14% 5.625ms 5.625ms 1
|
| 3960 |
+
aten::scaled_dot_product_attention 0.28% 19.941us 1.97% 141.843us 47.281us 0.000us 0.00% 4.980ms 1.660ms 3
|
| 3961 |
+
aten::_scaled_dot_product_efficient_attention 0.25% 17.669us 1.70% 121.902us 40.634us 0.000us 0.00% 4.980ms 1.660ms 3
|
| 3962 |
+
aten::_efficient_attention_forward 0.38% 27.651us 1.14% 82.182us 27.394us 4.980ms 88.66% 4.980ms 1.660ms 3
|
| 3963 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.980ms 88.66% 4.980ms 1.660ms 3
|
| 3964 |
+
aten::contiguous 0.10% 7.480us 23.48% 1.688ms 187.567us 0.000us 0.00% 691.071us 76.786us 9
|
| 3965 |
+
aten::clone 0.30% 21.261us 23.38% 1.681ms 186.736us 0.000us 0.00% 691.071us 76.786us 9
|
| 3966 |
+
aten::copy_ 0.85% 60.983us 22.39% 1.610ms 178.842us 637.247us 11.34% 691.071us 76.786us 9
|
| 3967 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 637.247us 11.34% 637.247us 70.805us 9
|
| 3968 |
+
Activity Buffer Request 20.63% 1.483ms 20.63% 1.483ms 1.483ms 53.824us 0.96% 53.824us 53.824us 1
|
| 3969 |
+
aten::transpose 0.67% 48.164us 0.89% 64.122us 2.672us 0.000us 0.00% 0.000us 0.000us 24
|
| 3970 |
+
aten::as_strided 0.22% 15.958us 0.22% 15.958us 0.665us 0.000us 0.00% 0.000us 0.000us 24
|
| 3971 |
+
aten::empty_like 0.16% 11.580us 0.69% 49.790us 5.532us 0.000us 0.00% 0.000us 0.000us 9
|
| 3972 |
+
aten::empty 0.89% 63.701us 0.89% 63.701us 3.033us 0.000us 0.00% 0.000us 0.000us 21
|
| 3973 |
+
cudaLaunchKernel 1.22% 87.751us 1.22% 87.751us 7.313us 0.000us 0.00% 0.000us 0.000us 12
|
| 3974 |
+
cudaStreamIsCapturing 0.04% 3.090us 0.04% 3.090us 1.030us 0.000us 0.00% 0.000us 0.000us 3
|
| 3975 |
+
cudaFuncSetAttribute 0.05% 3.339us 0.05% 3.339us 1.113us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaDeviceSynchronize 70.47% 5.066ms 70.47% 5.066ms 5.066ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3977 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3978 |
+
Self CPU time total: 7.190ms
|
| 3979 |
+
Self CUDA time total: 5.617ms
|
| 3980 |
|
| 3981 |
|
| 3982 |
|
|
|
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3988 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3989 |
+
torch_mem_eff 3.37% 266.115us 31.17% 2.458ms 2.458ms 0.000us 0.00% 6.082ms 6.082ms 1
|
| 3990 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.032ms 100.14% 6.032ms 6.032ms 1
|
| 3991 |
+
aten::scaled_dot_product_attention 0.25% 19.720us 1.92% 151.403us 50.468us 0.000us 0.00% 5.369ms 1.790ms 3
|
| 3992 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 18.800us 1.67% 131.683us 43.894us 0.000us 0.00% 5.369ms 1.790ms 3
|
| 3993 |
+
aten::_efficient_attention_forward 0.36% 28.452us 1.04% 81.963us 27.321us 5.369ms 89.14% 5.369ms 1.790ms 3
|
| 3994 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.369ms 89.14% 5.369ms 1.790ms 3
|
| 3995 |
+
aten::contiguous 0.10% 7.851us 25.32% 1.997ms 221.887us 0.000us 0.00% 712.865us 79.207us 9
|
| 3996 |
+
aten::clone 0.51% 40.412us 25.22% 1.989ms 221.015us 0.000us 0.00% 712.865us 79.207us 9
|
| 3997 |
+
aten::copy_ 0.83% 65.138us 24.07% 1.898ms 210.924us 654.369us 10.86% 712.865us 79.207us 9
|
| 3998 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.369us 10.86% 654.369us 72.708us 9
|
| 3999 |
+
Activity Buffer Request 22.37% 1.764ms 22.37% 1.764ms 1.764ms 58.496us 0.97% 58.496us 58.496us 1
|
| 4000 |
+
aten::transpose 0.63% 49.872us 0.95% 74.812us 3.117us 0.000us 0.00% 0.000us 0.000us 24
|
| 4001 |
+
aten::as_strided 0.32% 24.940us 0.32% 24.940us 1.039us 0.000us 0.00% 0.000us 0.000us 24
|
| 4002 |
+
aten::empty_like 0.15% 11.509us 0.64% 50.401us 5.600us 0.000us 0.00% 0.000us 0.000us 9
|
| 4003 |
+
aten::empty 0.82% 64.330us 0.82% 64.330us 3.063us 0.000us 0.00% 0.000us 0.000us 21
|
| 4004 |
+
cudaLaunchKernel 1.16% 91.554us 1.16% 91.554us 7.629us 0.000us 0.00% 0.000us 0.000us 12
|
| 4005 |
+
cudaStreamIsCapturing 0.03% 2.671us 0.03% 2.671us 0.890us 0.000us 0.00% 0.000us 0.000us 3
|
| 4006 |
+
cudaFuncSetAttribute 0.04% 3.101us 0.04% 3.101us 1.034us 0.000us 0.00% 0.000us 0.000us 3
|
| 4007 |
+
cudaDeviceSynchronize 68.83% 5.428ms 68.83% 5.428ms 5.428ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4008 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4009 |
+
Self CPU time total: 7.886ms
|
| 4010 |
+
Self CUDA time total: 6.024ms
|
| 4011 |
|
| 4012 |
|
| 4013 |
|
|
|
|
| 4017 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4018 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4019 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4020 |
+
torch_mem_eff 4.19% 329.379us 30.22% 2.377ms 2.377ms 0.000us 0.00% 6.195ms 6.195ms 1
|
| 4021 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.146ms 100.15% 6.146ms 6.146ms 1
|
| 4022 |
+
aten::scaled_dot_product_attention 0.26% 20.400us 1.80% 141.523us 47.174us 0.000us 0.00% 5.484ms 1.828ms 3
|
| 4023 |
+
aten::_scaled_dot_product_efficient_attention 0.23% 17.780us 1.54% 121.123us 40.374us 0.000us 0.00% 5.484ms 1.828ms 3
|
| 4024 |
+
aten::_efficient_attention_forward 0.36% 28.239us 1.03% 81.303us 27.101us 5.484ms 89.36% 5.484ms 1.828ms 3
|
| 4025 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.484ms 89.36% 5.484ms 1.828ms 3
|
| 4026 |
+
aten::contiguous 0.10% 8.071us 23.69% 1.863ms 207.042us 0.000us 0.00% 711.166us 79.018us 9
|
| 4027 |
+
aten::clone 0.27% 21.510us 23.59% 1.855ms 206.145us 0.000us 0.00% 711.166us 79.018us 9
|
| 4028 |
+
aten::copy_ 0.81% 63.940us 22.65% 1.781ms 197.883us 652.767us 10.64% 711.166us 79.018us 9
|
| 4029 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 652.767us 10.64% 652.767us 72.530us 9
|
| 4030 |
+
Activity Buffer Request 18.20% 1.431ms 18.20% 1.431ms 1.431ms 58.399us 0.95% 58.399us 58.399us 1
|
| 4031 |
+
aten::transpose 0.61% 48.309us 0.82% 64.340us 2.681us 0.000us 0.00% 0.000us 0.000us 24
|
| 4032 |
+
aten::as_strided 0.20% 16.031us 0.20% 16.031us 0.668us 0.000us 0.00% 0.000us 0.000us 24
|
| 4033 |
+
aten::empty_like 0.14% 11.029us 0.67% 52.851us 5.872us 0.000us 0.00% 0.000us 0.000us 9
|
| 4034 |
+
aten::empty 0.84% 66.365us 0.84% 66.365us 3.160us 0.000us 0.00% 0.000us 0.000us 21
|
| 4035 |
+
cudaLaunchKernel 3.91% 307.476us 3.91% 307.476us 25.623us 0.000us 0.00% 0.000us 0.000us 12
|
| 4036 |
+
cudaStreamIsCapturing 0.03% 2.550us 0.03% 2.550us 0.850us 0.000us 0.00% 0.000us 0.000us 3
|
| 4037 |
+
cudaFuncSetAttribute 0.05% 4.011us 0.05% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3
|
| 4038 |
+
cudaDeviceSynchronize 69.78% 5.488ms 69.78% 5.488ms 5.488ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4039 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4040 |
+
Self CPU time total: 7.864ms
|
| 4041 |
+
Self CUDA time total: 6.137ms
|
| 4042 |
|
| 4043 |
|
| 4044 |
|
|
|
|
| 4048 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4049 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4050 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4051 |
+
torch_mem_eff 3.07% 246.275us 28.09% 2.251ms 2.251ms 0.000us 0.00% 6.379ms 6.379ms 1
|
| 4052 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.328ms 100.14% 6.328ms 6.328ms 1
|
| 4053 |
+
aten::scaled_dot_product_attention 0.24% 19.011us 1.78% 142.253us 47.418us 0.000us 0.00% 5.653ms 1.884ms 3
|
| 4054 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 19.261us 1.54% 123.242us 41.081us 0.000us 0.00% 5.653ms 1.884ms 3
|
| 4055 |
+
aten::_efficient_attention_forward 0.35% 28.069us 1.02% 81.511us 27.170us 5.653ms 89.46% 5.653ms 1.884ms 3
|
| 4056 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.653ms 89.46% 5.653ms 1.884ms 3
|
| 4057 |
+
aten::contiguous 0.10% 7.649us 22.70% 1.819ms 202.115us 0.000us 0.00% 725.600us 80.622us 9
|
| 4058 |
+
aten::clone 0.27% 22.011us 22.61% 1.811ms 201.265us 0.000us 0.00% 725.600us 80.622us 9
|
| 4059 |
+
aten::copy_ 0.79% 63.041us 21.68% 1.737ms 193.055us 666.112us 10.54% 725.600us 80.622us 9
|
| 4060 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 666.112us 10.54% 666.112us 74.012us 9
|
| 4061 |
+
Activity Buffer Request 18.14% 1.453ms 18.14% 1.453ms 1.453ms 59.488us 0.94% 59.488us 59.488us 1
|
| 4062 |
+
aten::transpose 0.62% 49.849us 0.82% 66.103us 2.754us 0.000us 0.00% 0.000us 0.000us 24
|
| 4063 |
+
aten::as_strided 0.20% 16.254us 0.20% 16.254us 0.677us 0.000us 0.00% 0.000us 0.000us 24
|
| 4064 |
+
aten::empty_like 0.15% 11.889us 0.65% 51.880us 5.764us 0.000us 0.00% 0.000us 0.000us 9
|
| 4065 |
+
aten::empty 0.80% 64.291us 0.80% 64.291us 3.061us 0.000us 0.00% 0.000us 0.000us 21
|
| 4066 |
+
cudaLaunchKernel 3.04% 243.917us 3.04% 243.917us 20.326us 0.000us 0.00% 0.000us 0.000us 12
|
| 4067 |
+
cudaStreamIsCapturing 0.04% 3.200us 0.04% 3.200us 1.067us 0.000us 0.00% 0.000us 0.000us 3
|
| 4068 |
+
cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
|
| 4069 |
+
cudaDeviceSynchronize 71.91% 5.762ms 71.91% 5.762ms 5.762ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4070 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4071 |
+
Self CPU time total: 8.013ms
|
| 4072 |
+
Self CUDA time total: 6.319ms
|
| 4073 |
|
| 4074 |
|
| 4075 |
|
|
|
|
| 4079 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4080 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4081 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4082 |
+
torch_mem_eff 2.99% 249.826us 26.96% 2.254ms 2.254ms 0.000us 0.00% 6.738ms 6.738ms 1
|
| 4083 |
+
torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.686ms 100.13% 6.686ms 6.686ms 1
|
| 4084 |
+
aten::scaled_dot_product_attention 0.22% 18.532us 1.72% 143.464us 47.821us 0.000us 0.00% 6.005ms 2.002ms 3
|
| 4085 |
+
aten::_scaled_dot_product_efficient_attention 0.24% 19.750us 1.49% 124.932us 41.644us 0.000us 0.00% 6.005ms 2.002ms 3
|
| 4086 |
+
aten::_efficient_attention_forward 0.34% 28.159us 0.97% 81.312us 27.104us 6.005ms 89.92% 6.005ms 2.002ms 3
|
| 4087 |
+
fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.005ms 89.92% 6.005ms 2.002ms 3
|
| 4088 |
+
aten::contiguous 0.11% 8.892us 21.70% 1.814ms 201.591us 0.000us 0.00% 733.564us 81.507us 9
|
| 4089 |
+
aten::clone 0.28% 23.489us 21.59% 1.805ms 200.603us 0.000us 0.00% 733.564us 81.507us 9
|
| 4090 |
+
aten::copy_ 0.78% 65.381us 20.67% 1.729ms 192.090us 672.957us 10.08% 733.564us 81.507us 9
|
| 4091 |
+
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.957us 10.08% 672.957us 74.773us 9
|
| 4092 |
+
Activity Buffer Request 17.24% 1.442ms 17.24% 1.442ms 1.442ms 60.607us 0.91% 60.607us 60.607us 1
|
| 4093 |
+
aten::transpose 0.64% 53.558us 0.84% 70.590us 2.941us 0.000us 0.00% 0.000us 0.000us 24
|
| 4094 |
+
aten::as_strided 0.20% 17.032us 0.20% 17.032us 0.710us 0.000us 0.00% 0.000us 0.000us 24
|
| 4095 |
+
aten::empty_like 0.15% 12.490us 0.64% 53.131us 5.903us 0.000us 0.00% 0.000us 0.000us 9
|
| 4096 |
+
aten::empty 0.79% 65.813us 0.79% 65.813us 3.134us 0.000us 0.00% 0.000us 0.000us 21
|
| 4097 |
+
cudaLaunchKernel 2.91% 243.356us 2.91% 243.356us 20.280us 0.000us 0.00% 0.000us 0.000us 12
|
| 4098 |
+
cudaStreamIsCapturing 0.04% 3.000us 0.04% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
|
| 4099 |
+
cudaFuncSetAttribute 0.04% 3.289us 0.04% 3.289us 1.096us 0.000us 0.00% 0.000us 0.000us 3
|
| 4100 |
+
cudaDeviceSynchronize 73.04% 6.108ms 73.04% 6.108ms 6.108ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4101 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4102 |
+
Self CPU time total: 8.362ms
|
| 4103 |
+
Self CUDA time total: 6.678ms
|
| 4104 |
|
| 4105 |
|
| 4106 |
impl wl p50(ms) ok
|
| 4107 |
+
torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
|
| 4108 |
+
torch_mem_eff cuda_attn_L256_bfloat16 1.91 True
|
| 4109 |
+
torch_mem_eff cuda_attn_L320_bfloat16 1.96 True
|
| 4110 |
torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
|
| 4111 |
+
torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
|
| 4112 |
+
torch_mem_eff cuda_attn_L512_bfloat16 2.18 True
|
| 4113 |
</pre></div>
|
| 4114 |
<div class="cell-artifacts">
|
| 4115 |
<h4>Artifacts:</h4>
|
| 4116 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
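Note on the traces above: the `torch_mem_eff` path runs `aten::scaled_dot_product_attention` → `aten::_scaled_dot_product_efficient_attention` → the `fmha_cutlassF_bf16_*` CUTLASS kernel, with `aten::contiguous`/`aten::copy_` accounting for the remaining ~10% of CUDA time. A minimal sketch of pinning SDPA to that backend; shapes and dtype are illustrative assumptions, not the benchmark's actual configuration:

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Illustrative shapes: (batch, heads, seq_len, head_dim) in bfloat16.
q = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Restrict dispatch to the memory-efficient backend; this is what appears
# in the profiles as aten::_efficient_attention_forward and fmha_cutlassF_*.
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)
torch.cuda.synchronize()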
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -3869,9 +3869,9 @@
@@ -3920,23 +3920,28 @@ Cell: benchmark | 4.32s
[The old side was truncated during extraction; every workload already failed with a cut-off "Error: module '…'" line. The updated output, with the full error text plus the UV install log and fetch progress added in this revision, follows.]
| 3869 |
<span class="collapse-indicators">
|
| 3870 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
+
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 4.85s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3920 |
<div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
|
| 3921 |
impl wl p50(ms) ok
|
| 3922 |
sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
|
| 3923 |
+
Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
|
| 3924 |
sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
|
| 3925 |
+
Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
|
| 3926 |
sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
|
| 3927 |
+
Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
|
| 3928 |
sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
|
| 3929 |
+
Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
|
| 3930 |
sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
|
| 3931 |
+
Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
|
| 3932 |
sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
|
| 3933 |
+
Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
|
| 3934 |
</pre></div>
|
| 3935 |
+
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3936 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3937 |
+
<div class="uv-logs-content" style="display: none;">
|
| 3938 |
+
Installed 15 packages in 14ms
|
| 3939 |
</div>
|
| 3940 |
+
</div>
|
| 3941 |
+
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3942 |
+
Fetching 11 files: 27%|██▋ | 3/11 [00:00<00:00, 25.80it/s]
|
| 3943 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 12.20it/s]
|
| 3944 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 17.81it/s]</div>
|
| 3945 |
<div class="cell-artifacts">
|
| 3946 |
<h4>Artifacts:</h4>
|
| 3947 |
<a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
|
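The repeated failure above ("has no attribute 'fwd'") is what Python raises when the benchmark calls an entry point the loaded extension module does not export; here the hub-built module is `sage_attention_717bd9367b3cdd60`. A small defensive-lookup sketch that fails with the module's actual exports listed; the attribute name comes from the error text, and how the module is obtained is left abstract:

def resolve_entry_point(mod, name="fwd"):
    """Return mod.<name>, or fail listing what the module actually exports."""
    fn = getattr(mod, name, None)
    if fn is None:
        exported = sorted(a for a in dir(mod) if not a.startswith("_"))
        raise AttributeError(
            f"module {mod.__name__!r} has no attribute {name!r}; "
            f"exported symbols: {exported}"
        )
    return fn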
flash_attn/impls/xformers.html
CHANGED
|
@@ -3871,7 +3871,7 @@
@@ -3923,21 +3923,21 @@
@@ -3947,21 +3947,21 @@
@@ -3971,21 +3971,21 @@
@@ -3995,21 +3995,21 @@
@@ -4019,21 +4019,21 @@
@@ -4043,37 +4043,37 @@
[The old side was truncated during extraction; it carried the previous cell timing, the previous per-workload profile traces and p50 rows, and the previous installed-package count. The updated output follows in full.]
| 3871 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark | 8.72s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3877 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3923 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3924 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3925 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3926 |
+
xformers_meff 10.73% 480.812us 51.38% 2.302ms 2.302ms 0.000us 0.00% 3.631ms 3.631ms 1
|
| 3927 |
+
xformers_flash3::flash_fwd 4.61% 206.363us 39.81% 1.783ms 594.453us 0.000us 0.00% 3.631ms 1.210ms 3
|
| 3928 |
+
flash_attn_3::fwd 1.72% 77.043us 35.21% 1.577ms 525.665us 2.730ms 100.00% 3.631ms 1.210ms 3
|
| 3929 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.731ms 100.06% 2.731ms 2.731ms 1
|
| 3930 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.00% 2.730ms 909.864us 3
|
| 3931 |
+
Activity Buffer Request 31.52% 1.412ms 31.52% 1.412ms 1.412ms 901.213us 33.02% 901.213us 901.213us 1
|
| 3932 |
+
aten::empty 0.77% 34.510us 0.77% 34.510us 5.752us 0.000us 0.00% 0.000us 0.000us 6
|
| 3933 |
+
cudaFuncSetAttribute 0.24% 10.880us 0.24% 10.880us 3.627us 0.000us 0.00% 0.000us 0.000us 3
|
| 3934 |
+
cudaLaunchKernel 0.96% 42.842us 0.96% 42.842us 14.281us 0.000us 0.00% 0.000us 0.000us 3
|
| 3935 |
+
aten::reshape 0.26% 11.610us 0.84% 37.430us 6.238us 0.000us 0.00% 0.000us 0.000us 6
|
| 3936 |
+
aten::view 0.58% 25.820us 0.58% 25.820us 4.303us 0.000us 0.00% 0.000us 0.000us 6
|
| 3937 |
+
cudaDeviceSynchronize 48.62% 2.178ms 48.62% 2.178ms 2.178ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3938 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3939 |
+
Self CPU time total: 4.479ms
|
| 3940 |
+
Self CUDA time total: 2.730ms
|
| 3941 |
|
| 3942 |
|
| 3943 |
|
|
|
|
| 3947 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3948 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3949 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3950 |
+
xformers_meff 7.14% 318.116us 45.64% 2.033ms 2.033ms 0.000us 0.00% 3.819ms 3.819ms 1
|
| 3951 |
+
xformers_flash3::flash_fwd 3.43% 153.034us 38.00% 1.693ms 564.339us 0.000us 0.00% 3.819ms 1.273ms 3
|
| 3952 |
+
flash_attn_3::fwd 1.25% 55.902us 34.56% 1.540ms 513.328us 2.852ms 100.00% 3.819ms 1.273ms 3
|
| 3953 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.853ms 100.05% 2.853ms 2.853ms 1
|
| 3954 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.00% 2.852ms 950.587us 3
|
| 3955 |
+
Activity Buffer Request 31.72% 1.413ms 31.72% 1.413ms 1.413ms 967.259us 33.92% 967.259us 967.259us 1
|
| 3956 |
+
aten::empty 0.68% 30.270us 0.68% 30.270us 5.045us 0.000us 0.00% 0.000us 0.000us 6
|
| 3957 |
+
cudaFuncSetAttribute 0.13% 5.700us 0.13% 5.700us 1.900us 0.000us 0.00% 0.000us 0.000us 3
|
| 3958 |
+
cudaLaunchKernel 0.78% 34.811us 0.78% 34.811us 11.604us 0.000us 0.00% 0.000us 0.000us 3
|
| 3959 |
+
aten::reshape 0.19% 8.522us 0.50% 22.121us 3.687us 0.000us 0.00% 0.000us 0.000us 6
|
| 3960 |
+
aten::view 0.31% 13.599us 0.31% 13.599us 2.266us 0.000us 0.00% 0.000us 0.000us 6
|
| 3961 |
+
cudaDeviceSynchronize 54.36% 2.422ms 54.36% 2.422ms 2.422ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3962 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3963 |
+
Self CPU time total: 4.455ms
|
| 3964 |
+
Self CUDA time total: 2.852ms
|
| 3965 |
|
| 3966 |
|
| 3967 |
|
|
|
|
| 3971 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3972 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3973 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3974 |
+
xformers_meff 6.88% 312.747us 44.90% 2.040ms 2.040ms 0.000us 0.00% 3.937ms 3.937ms 1
|
| 3975 |
+
xformers_flash3::flash_fwd 3.35% 152.284us 37.52% 1.705ms 568.205us 0.000us 0.00% 3.937ms 1.312ms 3
|
| 3976 |
+
flash_attn_3::fwd 1.19% 54.281us 34.17% 1.552ms 517.444us 2.934ms 100.00% 3.937ms 1.312ms 3
|
| 3977 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.936ms 100.05% 2.936ms 2.936ms 1
|
| 3978 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.00% 2.934ms 977.979us 3
|
| 3979 |
+
Activity Buffer Request 31.39% 1.426ms 31.39% 1.426ms 1.426ms 1.003ms 34.19% 1.003ms 1.003ms 1
|
| 3980 |
+
aten::empty 0.67% 30.639us 0.67% 30.639us 5.106us 0.000us 0.00% 0.000us 0.000us 6
|
| 3981 |
+
cudaFuncSetAttribute 0.14% 6.530us 0.14% 6.530us 2.177us 0.000us 0.00% 0.000us 0.000us 3
|
| 3982 |
+
cudaLaunchKernel 0.77% 34.781us 0.77% 34.781us 11.594us 0.000us 0.00% 0.000us 0.000us 3
|
| 3983 |
+
aten::reshape 0.19% 8.650us 0.49% 22.320us 3.720us 0.000us 0.00% 0.000us 0.000us 6
|
| 3984 |
+
aten::view 0.30% 13.670us 0.30% 13.670us 2.278us 0.000us 0.00% 0.000us 0.000us 6
|
| 3985 |
+
cudaDeviceSynchronize 55.10% 2.503ms 55.10% 2.503ms 2.503ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3986 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3987 |
+
Self CPU time total: 4.543ms
|
| 3988 |
+
Self CUDA time total: 2.934ms
|
| 3989 |
|
| 3990 |
|
| 3991 |
|
|
|
|
| 3995 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3996 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3997 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3998 |
+
xformers_meff 6.56% 308.746us 47.29% 2.227ms 2.227ms 0.000us 0.00% 3.897ms 3.897ms 1
|
| 3999 |
+
xformers_flash3::flash_fwd 3.22% 151.743us 40.27% 1.897ms 632.183us 0.000us 0.00% 3.897ms 1.299ms 3
|
| 4000 |
+
flash_attn_3::fwd 1.19% 56.081us 37.05% 1.745ms 581.602us 2.911ms 100.00% 3.897ms 1.299ms 3
|
| 4001 |
+
xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.05% 2.913ms 2.913ms 1
|
| 4002 |
+
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.00% 2.911ms 970.491us 3
|
| 4003 |
+
Activity Buffer Request 30.05% 1.415ms 30.05% 1.415ms 1.415ms 985.179us 33.84% 985.179us 985.179us 1
|
| 4004 |
+
aten::empty 0.65% 30.820us 0.65% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
|
| 4005 |
+
cudaFuncSetAttribute 0.13% 6.030us 0.13% 6.030us 2.010us 0.000us 0.00% 0.000us 0.000us 3
|
| 4006 |
+
cudaLaunchKernel 5.02% 236.645us 5.02% 236.645us 78.882us 0.000us 0.00% 0.000us 0.000us 3
|
| 4007 |
+
aten::reshape 0.18% 8.502us 0.47% 22.111us 3.685us 0.000us 0.00% 0.000us 0.000us 6
|
| 4008 |
+
aten::view 0.29% 13.609us 0.29% 13.609us 2.268us 0.000us 0.00% 0.000us 0.000us 6
|
| 4009 |
+
cudaDeviceSynchronize 52.71% 2.482ms 52.71% 2.482ms 2.482ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4010 |
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 4.710ms
+Self CUDA time total: 2.911ms

 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+xformers_meff 6.33% 326.758us 43.32% 2.236ms 2.236ms 0.000us 0.00% 4.559ms 4.559ms 1
+xformers_flash3::flash_fwd 3.59% 185.275us 36.53% 1.885ms 628.414us 0.000us 0.00% 4.559ms 1.520ms 3
+flash_attn_3::fwd 1.12% 57.990us 32.94% 1.700ms 566.655us 3.412ms 100.00% 4.559ms 1.520ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.05% 3.413ms 3.413ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.412ms 100.00% 3.412ms 1.137ms 3
+Activity Buffer Request 27.43% 1.416ms 27.43% 1.416ms 1.416ms 1.147ms 33.63% 1.147ms 1.147ms 1
+aten::empty 0.66% 34.131us 0.66% 34.131us 5.688us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.12% 6.360us 0.12% 6.360us 2.120us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 3.60% 185.845us 3.60% 185.845us 61.948us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.17% 8.790us 0.46% 23.539us 3.923us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.29% 14.749us 0.29% 14.749us 2.458us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 56.68% 2.925ms 56.68% 2.925ms 2.925ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.161ms
+Self CUDA time total: 3.412ms

 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+xformers_meff 6.07% 310.905us 43.25% 2.215ms 2.215ms 0.000us 0.00% 4.499ms 4.499ms 1
+xformers_flash3::flash_fwd 3.55% 181.844us 36.73% 1.881ms 626.964us 0.000us 0.00% 4.499ms 1.500ms 3
+flash_attn_3::fwd 1.14% 58.453us 33.18% 1.699ms 566.349us 3.369ms 100.00% 4.499ms 1.500ms 3
+xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.371ms 100.06% 3.371ms 3.371ms 1
+void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.00% 3.369ms 1.123ms 3
+Activity Buffer Request 27.78% 1.423ms 27.78% 1.423ms 1.423ms 1.130ms 33.54% 1.130ms 1.130ms 1
+aten::empty 0.65% 33.340us 0.65% 33.340us 5.557us 0.000us 0.00% 0.000us 0.000us 6
+cudaFuncSetAttribute 0.11% 5.670us 0.11% 5.670us 1.890us 0.000us 0.00% 0.000us 0.000us 3
+cudaLaunchKernel 3.50% 178.983us 3.50% 178.983us 59.661us 0.000us 0.00% 0.000us 0.000us 3
+aten::reshape 0.17% 8.671us 0.45% 22.942us 3.824us 0.000us 0.00% 0.000us 0.000us 6
+aten::view 0.28% 14.271us 0.28% 14.271us 2.378us 0.000us 0.00% 0.000us 0.000us 6
+cudaDeviceSynchronize 56.75% 2.906ms 56.75% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1
 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 5.120ms
+Self CUDA time total: 3.369ms

 impl wl p50(ms) ok
+xformers_meff cuda_attn_L128_bfloat16 1.01 True
+xformers_meff cuda_attn_L256_bfloat16 1.04 True
+xformers_meff cuda_attn_L320_bfloat16 1.10 True
+xformers_meff cuda_attn_L384_bfloat16 1.10 True
 xformers_meff cuda_attn_L448_bfloat16 1.24 True
+xformers_meff cuda_attn_L512_bfloat16 1.24 True
 </pre></div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
 Downloading xformers (111.8MiB)
 Downloading xformers
+Installed 38 packages in 194ms
 </div>
 </div>
 <div class="cell-artifacts">
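The tables above are standard torch.profiler key-averages output for the xformers memory-efficient attention path; note that most wall time sits in cudaDeviceSynchronize and the Activity Buffer Request, i.e. synchronization waits and profiler buffer handling rather than kernel work. A minimal sketch of how such a table is produced, assuming xformers and a CUDA device are available; the shapes and dtype are illustrative, not the benchmark's exact workload:

```python
# Sketch only: profile one xformers memory-efficient attention call and print
# a key-averages table like the ones above. Shapes/dtype are assumptions.
import torch
from torch.profiler import ProfilerActivity, profile, record_function
import xformers.ops as xops

q = torch.randn(1, 512, 8, 64, device="cuda", dtype=torch.bfloat16)  # [B, M, H, K]
k, v = torch.randn_like(q), torch.randn_like(q)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("xformers_meff"):  # appears as a named row, as above
        xops.memory_efficient_attention(q, k, v)
    torch.cuda.synchronize()  # accounts for the cudaDeviceSynchronize row

# table() also prints the "Self CPU/CUDA time total" footers seen above.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))
```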
flash_attn/results/artifacts/combine/latency.svg CHANGED
flash_attn/results/combined_results.html CHANGED

@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
+<dc:date>2025-10-29T04:14:54.057236</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>

@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 [Embedded matplotlib SVG, axis markup only: the y-axis gridline paths, tick marks, and tick labels (ytick_1 through ytick_7) are updated for the new latency scale; the new labels read 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2 at y = 405.73, 344.64, 283.54, 222.44, 161.35, 100.25, 39.16.]

@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 [Embedded matplotlib SVG, series markup only: the polyline paths and circular data-point markers are updated with new p50 coordinates for series--torch-flash-ma, series--torch-mem-eff, series--xformers-meff, series--hf-kernels-flash-attn, and series--hf-kernels-flash-attn3.]

@@ -4230,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: combine | 4.
+Cell: combine | 4.23s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>

@@ -4338,47 +4338,47 @@ COMBINED BENCHMARK SUMMARY

 impl wl p50(ms) ok
 hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
-hf_kernels_flash_attn cuda_attn_L256_bfloat16
+hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
-hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.
+hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
-hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.
+hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
-hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.
+hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
-hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.
+hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
 hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
-hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.
+hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
-hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.
+hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
-hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.
+hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
-hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.
+hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
-hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.
+hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
 sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
 sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
-Error: module '
+Error: module 'sage_attention_717bd9367b3cdd60' has no attribute 'fwd'
 torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
-torch_flash_ma cuda_attn_L256_bfloat16 1.
+torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
 torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
 torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
-torch_flash_ma cuda_attn_L448_bfloat16 1.
+torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
-torch_flash_ma cuda_attn_L512_bfloat16 1.
+torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
-torch_mem_eff cuda_attn_L128_bfloat16 1.
+torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
-torch_mem_eff cuda_attn_L256_bfloat16 1.
+torch_mem_eff cuda_attn_L256_bfloat16 1.91 True
-torch_mem_eff cuda_attn_L320_bfloat16
+torch_mem_eff cuda_attn_L320_bfloat16 1.96 True
 torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
-torch_mem_eff cuda_attn_L448_bfloat16 2.
+torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
-torch_mem_eff cuda_attn_L512_bfloat16 2.
+torch_mem_eff cuda_attn_L512_bfloat16 2.18 True
-xformers_meff cuda_attn_L128_bfloat16
+xformers_meff cuda_attn_L128_bfloat16 1.01 True
-xformers_meff cuda_attn_L256_bfloat16 1.
+xformers_meff cuda_attn_L256_bfloat16 1.04 True
-xformers_meff cuda_attn_L320_bfloat16 1.
+xformers_meff cuda_attn_L320_bfloat16 1.10 True
-xformers_meff cuda_attn_L384_bfloat16 1.
+xformers_meff cuda_attn_L384_bfloat16 1.10 True
 xformers_meff cuda_attn_L448_bfloat16 1.24 True
-xformers_meff cuda_attn_L512_bfloat16 1.
+xformers_meff cuda_attn_L512_bfloat16 1.24 True

 GENERATING COMBINED VISUALIZATION

@@ -4402,7 +4402,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
-Installed 37 packages in
+Installed 37 packages in 199ms
 </div>
 </div>
 <div class="cell-artifacts">

@@ -4415,7 +4415,7 @@ Installed 37 packages in 221ms
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-<dc:date>2025-10-
+<dc:date>2025-10-29T04:14:54.057236</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>

@@ -4525,96 +4525,96 @@ Installed 37 packages in 221ms
 [Second embedded copy of the same SVG: identical y-axis tick and gridline update as in the @@ -3982 hunk above.]

@@ -4622,73 +4622,73 @@ Installed 37 packages in 221ms
 [Second embedded copy of the same SVG: identical series path and marker update as in the @@ -4079 hunk above.]
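Both rewritten SVGs are matplotlib renders of the combined p50-latency chart, one series per implementation. As a sketch of how such a chart could be regenerated from the summary values in the hunk above (figure size and styling are assumptions; the actual logic lives in cells/combine.py, which this diff does not show):

```python
# Sketch only: redraw a per-implementation p50 latency chart like the SVG
# this commit updates. Values are the ones from the combined summary above.
import matplotlib.pyplot as plt

workloads = ["L128", "L256", "L320", "L384", "L448", "L512"]
p50_ms = {
    "torch_flash_ma":         [1.21, 1.26, 1.29, 1.32, 1.48, 1.51],
    "torch_mem_eff":          [1.84, 1.91, 1.96, 2.04, 2.10, 2.18],
    "xformers_meff":          [1.01, 1.04, 1.10, 1.10, 1.24, 1.24],
    "hf_kernels_flash_attn":  [0.95, 1.00, 1.06, 1.08, 1.22, 1.22],
    "hf_kernels_flash_attn3": [0.93, 0.97, 1.03, 1.02, 1.20, 1.18],
}
fig, ax = plt.subplots(figsize=(9, 5))
for name, ys in p50_ms.items():
    ax.plot(workloads, ys, marker="o", label=name)  # categorical x-axis
ax.set_xlabel("workload (cuda_attn_*_bfloat16)")
ax.set_ylabel("p50 latency (ms)")
ax.grid(alpha=0.3)
ax.legend()
fig.savefig("latency.svg")  # matplotlib stamps the <dc:date> metadata seen above
```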
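The only implementation that fails in that summary is sage_int8_fp16: the dynamically built sage_attention module exports no fwd, so every workload aborts before timing. A defensive probe along these lines (a sketch; the benchmark's real loader is not part of this diff) surfaces the missing entry point and the available alternatives in one message:

```python
# Sketch only: fail fast when a dynamically loaded kernel module lacks the
# expected entry point. "fwd" is the attribute named in the errors above;
# `math` below is a stand-in for the real sage_attention_* module object.
def resolve_entry_point(module, name="fwd"):
    fn = getattr(module, name, None)
    if fn is None:
        exported = sorted(n for n in dir(module) if not n.startswith("_"))
        raise AttributeError(
            f"module {module.__name__!r} has no attribute {name!r}; "
            f"exported names: {exported}"
        )
    return fn

import math  # like the sage build, math exports no 'fwd'
try:
    resolve_entry_point(math)
except AttributeError as exc:
    print(exc)  # mirrors the "Error: module ... has no attribute 'fwd'" rows
```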
|
|
| 4402 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4403 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4404 |
<div class="uv-logs-content" style="display: none;">
|
| 4405 |
+
Installed 37 packages in 199ms
|
| 4406 |
</div>
|
| 4407 |
</div>
|
| 4408 |
<div class="cell-artifacts">
|
|
|
|
| 4415 |
<rdf:RDF>
|
| 4416 |
<ns2:Work>
|
| 4417 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4418 |
+
<dc:date>2025-10-29T04:14:54.057236</dc:date>
|
| 4419 |
<dc:format>image/svg+xml</dc:format>
|
| 4420 |
<dc:creator>
|
| 4421 |
<ns2:Agent>
|
|
|
|
| 4525 |
<g id="matplotlib.axis_2">
|
| 4526 |
<g id="ytick_1">
|
| 4527 |
<g id="grid-y--2" class="grid grid-y">
|
| 4528 |
+
<path d="M 47.81 405.733213 L 835.361742 405.733213 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4529 |
</g>
|
| 4530 |
<g id="line2d_7">
|
| 4531 |
<defs>
|
| 4532 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4533 |
</defs>
|
| 4534 |
<g>
|
| 4535 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="405.733213" style="stroke: #000000; stroke-width: 0.8" />
|
| 4536 |
</g>
|
| 4537 |
</g>
|
| 4538 |
<g id="text_7">
|
| 4539 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="409.532432" transform="rotate(-0 40.81 409.532432)">1.0</text>
|
| 4540 |
</g>
|
| 4541 |
</g>
|
| 4542 |
<g id="ytick_2">
|
| 4543 |
<g id="grid-y--3" class="grid grid-y">
|
| 4544 |
+
<path d="M 47.81 344.636964 L 835.361742 344.636964 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4545 |
</g>
|
| 4546 |
<g id="line2d_8">
|
| 4547 |
<g>
|
| 4548 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="344.636964" style="stroke: #000000; stroke-width: 0.8" />
|
| 4549 |
</g>
|
| 4550 |
</g>
|
| 4551 |
<g id="text_8">
|
| 4552 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="348.436183" transform="rotate(-0 40.81 348.436183)">1.2</text>
|
| 4553 |
</g>
|
| 4554 |
</g>
|
| 4555 |
<g id="ytick_3">
|
| 4556 |
<g id="grid-y--4" class="grid grid-y">
|
| 4557 |
+
<path d="M 47.81 283.540715 L 835.361742 283.540715 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4558 |
</g>
|
| 4559 |
<g id="line2d_9">
|
| 4560 |
<g>
|
| 4561 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="283.540715" style="stroke: #000000; stroke-width: 0.8" />
|
| 4562 |
</g>
|
| 4563 |
</g>
|
| 4564 |
<g id="text_9">
|
| 4565 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.339933" transform="rotate(-0 40.81 287.339933)">1.4</text>
|
| 4566 |
</g>
|
| 4567 |
</g>
|
| 4568 |
<g id="ytick_4">
|
| 4569 |
<g id="grid-y--5" class="grid grid-y">
|
| 4570 |
+
<path d="M 47.81 222.444466 L 835.361742 222.444466 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4571 |
</g>
|
| 4572 |
<g id="line2d_10">
|
| 4573 |
<g>
|
| 4574 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="222.444466" style="stroke: #000000; stroke-width: 0.8" />
|
| 4575 |
</g>
|
| 4576 |
</g>
|
| 4577 |
<g id="text_10">
|
| 4578 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="226.243684" transform="rotate(-0 40.81 226.243684)">1.6</text>
|
| 4579 |
</g>
|
| 4580 |
</g>
|
| 4581 |
<g id="ytick_5">
|
| 4582 |
<g id="grid-y--6" class="grid grid-y">
|
| 4583 |
+
<path d="M 47.81 161.348216 L 835.361742 161.348216 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4584 |
</g>
|
| 4585 |
<g id="line2d_11">
|
| 4586 |
<g>
|
| 4587 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="161.348216" style="stroke: #000000; stroke-width: 0.8" />
|
| 4588 |
</g>
|
| 4589 |
</g>
|
| 4590 |
<g id="text_11">
|
| 4591 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="165.147435" transform="rotate(-0 40.81 165.147435)">1.8</text>
|
| 4592 |
</g>
|
| 4593 |
</g>
|
| 4594 |
<g id="ytick_6">
|
| 4595 |
<g id="grid-y--7" class="grid grid-y">
|
| 4596 |
+
<path d="M 47.81 100.251967 L 835.361742 100.251967 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4597 |
</g>
|
| 4598 |
<g id="line2d_12">
|
| 4599 |
<g>
|
| 4600 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="100.251967" style="stroke: #000000; stroke-width: 0.8" />
|
| 4601 |
</g>
|
| 4602 |
</g>
|
| 4603 |
<g id="text_12">
|
| 4604 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="104.051186" transform="rotate(-0 40.81 104.051186)">2.0</text>
|
| 4605 |
</g>
|
| 4606 |
</g>
|
| 4607 |
<g id="ytick_7">
|
| 4608 |
<g id="grid-y--8" class="grid grid-y">
|
| 4609 |
+
<path d="M 47.81 39.155718 L 835.361742 39.155718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
|
| 4610 |
</g>
|
| 4611 |
<g id="line2d_13">
|
| 4612 |
<g>
|
| 4613 |
+
<use ns4:href="#m0fca2865ba" x="47.81" y="39.155718" style="stroke: #000000; stroke-width: 0.8" />
|
| 4614 |
</g>
|
| 4615 |
</g>
|
| 4616 |
<g id="text_13">
|
| 4617 |
+
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="42.954937" transform="rotate(-0 40.81 42.954937)">2.2</text>
|
| 4618 |
</g>
|
| 4619 |
</g>
|
| 4620 |
<g id="label--y" class="ylabel">
|
|
|
|
| 4622 |
</g>
|
| 4623 |
</g>
|
| 4624 |
<g id="series--torch-flash-ma" class="series">
|
| 4625 |
+
<path d="M 83.607806 340.364503 L 226.799032 326.486185 L 369.990258 317.620813 L 513.181484 308.266672 L 656.37271 260.522092 L 799.563935 250.419827 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
|
| 4626 |
<defs>
|
| 4627 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4628 |
</defs>
|
| 4629 |
<g clip-path="url(#p09feef2583)">
|
| 4630 |
+
<use ns4:href="#md7efaf3aec" x="83.607806" y="340.364503" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4631 |
+
<use ns4:href="#md7efaf3aec" x="226.799032" y="326.486185" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4632 |
+
<use ns4:href="#md7efaf3aec" x="369.990258" y="317.620813" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4633 |
+
<use ns4:href="#md7efaf3aec" x="513.181484" y="308.266672" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4634 |
+
<use ns4:href="#md7efaf3aec" x="656.37271" y="260.522092" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4635 |
+
<use ns4:href="#md7efaf3aec" x="799.563935" y="250.419827" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4636 |
</g>
|
| 4637 |
</g>
|
| 4638 |
<g id="series--torch-mem-eff" class="series">
|
| 4639 |
+
<path d="M 83.607806 150.170658 L 226.799032 128.245658 L 369.990258 112.32367 L 513.181484 87.053955 L 656.37271 71.037268 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
|
| 4640 |
<defs>
|
| 4641 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4642 |
</defs>
|
| 4643 |
<g clip-path="url(#p09feef2583)">
|
| 4644 |
+
<use ns4:href="#m9b8c54d372" x="83.607806" y="150.170658" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4645 |
+
<use ns4:href="#m9b8c54d372" x="226.799032" y="128.245658" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4646 |
+
<use ns4:href="#m9b8c54d372" x="369.990258" y="112.32367" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4647 |
+
<use ns4:href="#m9b8c54d372" x="513.181484" y="87.053955" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4648 |
+
<use ns4:href="#m9b8c54d372" x="656.37271" y="71.037268" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4649 |
<use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
|
| 4650 |
</g>
|
| 4651 |
</g>
|
| 4652 |
<g id="series--xformers-meff" class="series">
|
| 4653 |
+
<path d="M 83.607806 403.792796 L 226.799032 392.954016 L 369.990258 376.494687 L 513.181484 374.771467 L 656.37271 331.266966 L 799.563935 332.076491 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
|
| 4654 |
<defs>
|
| 4655 |
<path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
|
| 4656 |
</defs>
|
| 4657 |
<g clip-path="url(#p09feef2583)">
|
| 4658 |
+
<use ns4:href="#mc655281e0b" x="83.607806" y="403.792796" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4659 |
+
<use ns4:href="#mc655281e0b" x="226.799032" y="392.954016" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4660 |
+
<use ns4:href="#mc655281e0b" x="369.990258" y="376.494687" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4661 |
+
<use ns4:href="#mc655281e0b" x="513.181484" y="374.771467" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4662 |
+
<use ns4:href="#mc655281e0b" x="656.37271" y="331.266966" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4663 |
+
<use ns4:href="#mc655281e0b" x="799.563935" y="332.076491" style="fill: #2ca02c; stroke: #2ca02c" />
|
| 4664 |
</g>
|
| 4665 |
</g>
|
| 4666 |
<g id="series--hf-kernels-flash-attn" class="series">
|
| 4667 |
+
<path d="M 83.607806 420.124435 L 226.799032 406.579091 L 369.990258 388.286263 L 513.181484 382.130816 L 656.37271 337.645415 L 799.563935 338.021157 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
|
| 4668 |
<defs>
|
| 4669 |
<path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
|
| 4670 |
</defs>
|
| 4671 |
<g clip-path="url(#p09feef2583)">
|
| 4672 |
+
<use ns4:href="#m61c8040d7e" x="83.607806" y="420.124435" style="fill: #d62728; stroke: #d62728" />
|
| 4673 |
+
<use ns4:href="#m61c8040d7e" x="226.799032" y="406.579091" style="fill: #d62728; stroke: #d62728" />
|
| 4674 |
+
<use ns4:href="#m61c8040d7e" x="369.990258" y="388.286263" style="fill: #d62728; stroke: #d62728" />
|
| 4675 |
+
<use ns4:href="#m61c8040d7e" x="513.181484" y="382.130816" style="fill: #d62728; stroke: #d62728" />
|
| 4676 |
+
<use ns4:href="#m61c8040d7e" x="656.37271" y="337.645415" style="fill: #d62728; stroke: #d62728" />
|
| 4677 |
+
<use ns4:href="#m61c8040d7e" x="799.563935" y="338.021157" style="fill: #d62728; stroke: #d62728" />
|
| 4678 |
</g>
|
| 4679 |
</g>
|
| 4680 |
<g id="series--hf-kernels-flash-attn3" class="series">
|
| 4681 |
+
<path d="M 83.607806 428.387702 L 226.799032 415.233374 L 369.990258 396.467356 L 513.181484 398.1139 L 656.37271 345.939841 L 799.563935 352.101398 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
|
| 4682 |
<defs>
|
| 4683 |
<path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
|
| 4684 |
</defs>
|
| 4685 |
<g clip-path="url(#p09feef2583)">
|
| 4686 |
<use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
|
| 4687 |
+
<use ns4:href="#m7cd35be9cc" x="226.799032" y="415.233374" style="fill: #9467bd; stroke: #9467bd" />
|
| 4688 |
+
<use ns4:href="#m7cd35be9cc" x="369.990258" y="396.467356" style="fill: #9467bd; stroke: #9467bd" />
|
| 4689 |
+
<use ns4:href="#m7cd35be9cc" x="513.181484" y="398.1139" style="fill: #9467bd; stroke: #9467bd" />
|
| 4690 |
+
<use ns4:href="#m7cd35be9cc" x="656.37271" y="345.939841" style="fill: #9467bd; stroke: #9467bd" />
|
| 4691 |
+
<use ns4:href="#m7cd35be9cc" x="799.563935" y="352.101398" style="fill: #9467bd; stroke: #9467bd" />
|
| 4692 |
</g>
|
| 4693 |
</g>
|
| 4694 |
<g id="patch_3">
|
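The <g id="series--hf-kernels-flash-attn"> groups in this SVG come from matplotlib artists tagged with a gid: the SVG backend writes the gid out as the element id, which is what lets each implementation's latency line be addressed by name from the page. A minimal sketch of how such a figure could be generated, assuming matplotlib; the series names and p50 values below are illustrative, not the measured data:

import matplotlib.pyplot as plt

# Illustrative p50 latencies per workload; the real values live in the
# benchmark JSONL artifacts, not here.
series = {
    "hf-kernels-flash-attn": [0.42, 0.55, 0.71, 0.78, 1.23, 1.22],
    "hf-kernels-flash-attn3": [0.33, 0.46, 0.62, 0.60, 1.14, 1.07],
}

fig, ax = plt.subplots(figsize=(10, 5))
for name, p50s in series.items():
    (line,) = ax.plot(range(len(p50s)), p50s, marker="o", label=name)
    line.set_gid(f"series--{name}")  # emitted as <g id="series--..."> in the SVG

ax.set_ylabel("p50 latency (ms)")
ax.legend()
fig.savefig("latency.svg")  # the SVG backend preserves gid attributes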
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"ts": "2025-10-
|
| 2 |
-
{"ts": "2025-10-
|
| 3 |
-
{"ts": "2025-10-
|
| 4 |
-
{"ts": "2025-10-
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T04:14:34Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8066769999572898, "p50": 0.8187079999970592, "p90": 0.8202469999787354, "mean": 0.8162713999922744, "iqr": 0.004819999958272092, "raw_times": [0.8066769999572898, 0.8187079999970592, 0.8202980000078242, 0.8202469999787354, 0.8154270000204633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8278980000113734, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T04:14:34Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6742259999773523, "p50": 1.6819360000113193, "p90": 1.682725999955892, "mean": 1.6819601999827682, "iqr": 0.0012589999869305757, "raw_times": [1.6814669999689613, 1.6742259999773523, 1.6819360000113193, 1.6894460000003164, 1.682725999955892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6828459999942424, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T04:14:35Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6012950000003912, "p50": 1.6123539999739478, "p90": 1.612534000003052, "mean": 1.6096803999971598, "iqr": 0.005670000007285125, "raw_times": [1.6068639999957668, 1.612534000003052, 1.6123539999739478, 1.6012950000003912, 1.6153550000126415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6159039999479319, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T04:14:35Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.330611000023964, "p50": 3.334851999966304, "p90": 3.3351920000086466, "mean": 3.3337277999976322, "iqr": 0.003470000024208275, "raw_times": [3.3351920000086466, 3.330611000023964, 3.3317219999844383, 3.336262000004808, 3.334851999966304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.3335720000309266, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
|
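Each line of this JSONL is one self-contained benchmark record: the workload under "wl", the environment under "env", latency percentiles in milliseconds under "lat_ms", and the correctness comparison under "corr". A minimal sketch for reproducing the impl / wl / p50(ms) / ok summary that the benchmark cells print:

import json

with open("layer_norm/impls/artifacts/benchmark/layer_norm.jsonl") as f:
    for line in f:
        rec = json.loads(line)  # one JSON object per line
        print(f'{rec["impl"]:>22} {rec["wl"]["name"]:>20} '
              f'{rec["lat_ms"]["p50"]:7.2f} {rec["ok"]}')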
layer_norm/impls/cells/benchmark.py
CHANGED
|
@@ -3,7 +3,6 @@
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
-
# "kernels",
|
| 7 |
# "kernels-benchmark-tools",
|
| 8 |
# ]
|
| 9 |
#
|
|
@@ -13,37 +12,15 @@
|
|
| 13 |
import torch
|
| 14 |
import sys
|
| 15 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
| 16 |
-
from kernels import get_kernel
|
| 17 |
|
| 18 |
-
# Load the layer norm kernel
|
| 19 |
-
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
B, S, D = x.shape
|
| 24 |
-
# The kernel expects [N, D] input; support beta (bias) if provided.
|
| 25 |
-
out = layer_norm_kernel.dropout_add_ln_fwd(
|
| 26 |
-
input=x.view(-1, D),
|
| 27 |
-
gamma=weight,
|
| 28 |
-
beta=bias,
|
| 29 |
-
rowscale=None,
|
| 30 |
-
colscale=None,
|
| 31 |
-
x0_subset=None,
|
| 32 |
-
z_subset=None,
|
| 33 |
-
dropout_p=0.0,
|
| 34 |
-
epsilon=eps,
|
| 35 |
-
rowscale_const=1.0,
|
| 36 |
-
z_numrows=S,
|
| 37 |
-
gen=None,
|
| 38 |
-
residual_in_fp32=False,
|
| 39 |
-
is_rms_norm=False,
|
| 40 |
-
)[0].view(B, S, D)
|
| 41 |
-
return out
|
| 42 |
|
| 43 |
|
| 44 |
run_benchmark(
|
| 45 |
kernel_type=KernelTypeEnum.LAYER_NORM,
|
| 46 |
-
impl_name="
|
| 47 |
-
impl_tags={"family": "
|
| 48 |
-
impl_func=
|
| 49 |
)
|
|
|
|
| 3 |
# dependencies = [
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
|
|
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
# ]
|
| 8 |
#
|
|
|
|
| 12 |
import torch
|
| 13 |
import sys
|
| 14 |
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
|
|
|
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
|
| 18 |
+
    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
|
| 19 |
|
| 20 |
|
| 21 |
run_benchmark(
|
| 22 |
kernel_type=KernelTypeEnum.LAYER_NORM,
|
| 23 |
+
impl_name="torch_layer_norm",
|
| 24 |
+
impl_tags={"family": "torch", "op": "layer_norm"},
|
| 25 |
+
impl_func=torch_layer_norm,
|
| 26 |
)
|
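The rewritten cell benchmarks torch.nn.functional.layer_norm directly, and the JSONL above records that its bf16 output matches a reference within rtol=0.001, atol=0.03125. A sketch of that kind of check, assuming the reference is the same op run in fp32; what layer_norm_ref actually does inside the harness is not shown here:

import torch

B, S, D = 16, 2048, 4096
x = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
w = torch.randn(D, device="cuda", dtype=torch.bfloat16)
b = torch.randn(D, device="cuda", dtype=torch.bfloat16)

out = torch.nn.functional.layer_norm(x, (D,), w, b, 1e-5)
ref = torch.nn.functional.layer_norm(x.float(), (D,), w.float(), b.float(), 1e-5)

absmax = (out.float() - ref).abs().max().item()
print(f"absmax={absmax:.5f}")  # the runs above report absmax == 0.03125
assert torch.allclose(out.float(), ref, rtol=1e-3, atol=0.03125)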
layer_norm/impls/hf_kernels_layer_norm.html
CHANGED
|
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3873 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3874 |
</span> |
|
| 3875 |
-
Cell: benchmark |
|
| 3876 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3877 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3878 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
-
hf_kernels_layer_norm
|
| 3947 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 1.
|
| 3948 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3949 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3950 |
-
Activity Buffer Request 36.
|
| 3951 |
-
aten::view 0.
|
| 3952 |
-
aten::empty 1.
|
| 3953 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 3954 |
-
cudaLaunchKernel 1.
|
| 3955 |
-
cudaDeviceSynchronize 53.
|
| 3956 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3957 |
-
Self CPU time total:
|
| 3958 |
-
Self CUDA time total: 2.
|
| 3959 |
|
| 3960 |
|
| 3961 |
|
|
@@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
|
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
-
hf_kernels_layer_norm 2.
|
| 3969 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 47.
|
| 3970 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3971 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3972 |
-
Activity Buffer Request 22.
|
| 3973 |
-
aten::view 0.19% 11.
|
| 3974 |
-
aten::empty 0.
|
| 3975 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08%
|
| 3976 |
-
cudaLaunchKernel 0.
|
| 3977 |
-
cudaDeviceSynchronize 73.
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
-
Self CPU time total: 6.
|
| 3980 |
-
Self CUDA time total: 4.
|
| 3981 |
|
| 3982 |
|
| 3983 |
|
|
@@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
-
hf_kernels_layer_norm
|
| 3991 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 3992 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3993 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3994 |
-
Activity Buffer Request
|
| 3995 |
-
aten::view 0.
|
| 3996 |
-
aten::empty 0.
|
| 3997 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08%
|
| 3998 |
-
cudaLaunchKernel 0.
|
| 3999 |
-
cudaDeviceSynchronize 73.
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
-
Self CPU time total: 6.
|
| 4002 |
-
Self CUDA time total: 4.
|
| 4003 |
|
| 4004 |
|
| 4005 |
|
|
@@ -4009,37 +4009,36 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
|
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4011 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
-
hf_kernels_layer_norm 1.
|
| 4013 |
-
_layer_norm_f8ec252::dropout_add_ln_fwd 0.
|
| 4014 |
-
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4015 |
-
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4016 |
-
Activity Buffer Request
|
| 4017 |
-
aten::view 0.
|
| 4018 |
-
aten::empty 0.
|
| 4019 |
-
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.
|
| 4020 |
-
cudaLaunchKernel 2.
|
| 4021 |
-
cudaDeviceSynchronize
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
-
Self CPU time total:
|
| 4024 |
-
Self CUDA time total: 9.
|
| 4025 |
|
| 4026 |
|
| 4027 |
impl wl p50(ms) ok
|
| 4028 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4029 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.
|
| 4030 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4031 |
-
hf_kernels_layer_norm LN_B16_S4096_D8192 3.
|
| 4032 |
</pre></div>
|
| 4033 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4034 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4035 |
<div class="uv-logs-content" style="display: none;">
|
| 4036 |
-
Installed
|
| 4037 |
</div>
|
| 4038 |
</div>
|
| 4039 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4040 |
-
Fetching 4 files:
|
| 4041 |
-
Fetching 4 files:
|
| 4042 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.56it/s]</div>
|
| 4043 |
<div class="cell-artifacts">
|
| 4044 |
<h4>Artifacts:</h4>
|
| 4045 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
|
|
|
| 3872 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3873 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3874 |
</span> |
|
| 3875 |
+
Cell: benchmark | 10.03s
|
| 3876 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3877 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3878 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3943 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3944 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3945 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3946 |
+
hf_kernels_layer_norm 5.27% 208.522us 46.60% 1.845ms 1.845ms 0.000us 0.00% 3.097ms 3.097ms 1
|
| 3947 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 1.74% 68.841us 40.71% 1.611ms 537.108us 2.361ms 100.00% 3.097ms 1.032ms 3
|
| 3948 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1
|
| 3949 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.361ms 100.00% 2.361ms 786.869us 3
|
| 3950 |
+
Activity Buffer Request 36.42% 1.442ms 36.42% 1.442ms 1.442ms 736.192us 31.19% 736.192us 736.192us 1
|
| 3951 |
+
aten::view 0.63% 24.853us 0.63% 24.853us 4.142us 0.000us 0.00% 0.000us 0.000us 6
|
| 3952 |
+
aten::empty 1.30% 51.300us 1.30% 51.300us 5.700us 0.000us 0.00% 0.000us 0.000us 9
|
| 3953 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 9.370us 0.24% 9.370us 3.123us 0.000us 0.00% 0.000us 0.000us 3
|
| 3954 |
+
cudaLaunchKernel 1.02% 40.192us 1.02% 40.192us 13.397us 0.000us 0.00% 0.000us 0.000us 3
|
| 3955 |
+
cudaDeviceSynchronize 53.40% 2.114ms 53.40% 2.114ms 2.114ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3956 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3957 |
+
Self CPU time total: 3.958ms
|
| 3958 |
+
Self CUDA time total: 2.361ms
|
| 3959 |
|
| 3960 |
|
| 3961 |
|
|
|
|
| 3965 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3966 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3967 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3968 |
+
hf_kernels_layer_norm 2.10% 132.443us 26.89% 1.698ms 1.698ms 0.000us 0.00% 6.359ms 6.359ms 1
|
| 3969 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 47.272us 24.61% 1.554ms 517.847us 4.798ms 100.00% 6.359ms 2.120ms 3
|
| 3970 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.799ms 100.03% 4.799ms 4.799ms 1
|
| 3971 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.798ms 100.00% 4.798ms 1.599ms 3
|
| 3972 |
+
Activity Buffer Request 22.80% 1.439ms 22.80% 1.439ms 1.439ms 1.561ms 32.53% 1.561ms 1.561ms 1
|
| 3973 |
+
aten::view 0.19% 11.750us 0.19% 11.750us 1.958us 0.000us 0.00% 0.000us 0.000us 6
|
| 3974 |
+
aten::empty 0.50% 31.791us 0.50% 31.791us 3.532us 0.000us 0.00% 0.000us 0.000us 9
|
| 3975 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.299us 0.08% 5.299us 1.766us 0.000us 0.00% 0.000us 0.000us 3
|
| 3976 |
+
cudaLaunchKernel 0.47% 29.920us 0.47% 29.920us 9.973us 0.000us 0.00% 0.000us 0.000us 3
|
| 3977 |
+
cudaDeviceSynchronize 73.11% 4.615ms 73.11% 4.615ms 4.615ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3978 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3979 |
+
Self CPU time total: 6.313ms
|
| 3980 |
+
Self CUDA time total: 4.798ms
|
| 3981 |
|
| 3982 |
|
| 3983 |
|
|
|
|
| 3987 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3988 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3989 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3990 |
+
hf_kernels_layer_norm 1.81% 113.352us 26.62% 1.665ms 1.665ms 0.000us 0.00% 6.298ms 6.298ms 1
|
| 3991 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.74% 46.460us 24.63% 1.540ms 513.314us 4.755ms 100.00% 6.298ms 2.099ms 3
|
| 3992 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.757ms 100.03% 4.757ms 4.757ms 1
|
| 3993 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.755ms 100.00% 4.755ms 1.585ms 3
|
| 3994 |
+
Activity Buffer Request 22.82% 1.427ms 22.82% 1.427ms 1.427ms 1.543ms 32.44% 1.543ms 1.543ms 1
|
| 3995 |
+
aten::view 0.19% 11.631us 0.19% 11.631us 1.939us 0.000us 0.00% 0.000us 0.000us 6
|
| 3996 |
+
aten::empty 0.51% 31.740us 0.51% 31.740us 3.527us 0.000us 0.00% 0.000us 0.000us 9
|
| 3997 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.941us 0.08% 4.941us 1.647us 0.000us 0.00% 0.000us 0.000us 3
|
| 3998 |
+
cudaLaunchKernel 0.48% 29.911us 0.48% 29.911us 9.970us 0.000us 0.00% 0.000us 0.000us 3
|
| 3999 |
+
cudaDeviceSynchronize 73.38% 4.589ms 73.38% 4.589ms 4.589ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4000 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4001 |
+
Self CPU time total: 6.253ms
|
| 4002 |
+
Self CUDA time total: 4.755ms
|
| 4003 |
|
| 4004 |
|
| 4005 |
|
|
|
|
| 4009 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4010 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4011 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4012 |
+
hf_kernels_layer_norm 1.13% 113.823us 5.68% 571.343us 571.343us 0.000us 0.00% 12.836ms 12.836ms 1
|
| 4013 |
+
_layer_norm_f8ec252::dropout_add_ln_fwd 0.45% 45.540us 4.43% 445.300us 148.433us 9.651ms 100.00% 12.836ms 4.279ms 3
|
| 4014 |
+
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.652ms 100.01% 9.652ms 9.652ms 1
|
| 4015 |
+
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.651ms 100.00% 9.651ms 3.217ms 3
|
| 4016 |
+
Activity Buffer Request 1.18% 119.172us 1.18% 119.172us 119.172us 3.185ms 33.00% 3.185ms 3.185ms 1
|
| 4017 |
+
aten::view 0.12% 12.220us 0.12% 12.220us 2.037us 0.000us 0.00% 0.000us 0.000us 6
|
| 4018 |
+
aten::empty 0.31% 31.382us 0.31% 31.382us 3.487us 0.000us 0.00% 0.000us 0.000us 9
|
| 4019 |
+
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 4.801us 0.05% 4.801us 1.600us 0.000us 0.00% 0.000us 0.000us 3
|
| 4020 |
+
cudaLaunchKernel 2.43% 244.405us 2.43% 244.405us 81.468us 0.000us 0.00% 0.000us 0.000us 3
|
| 4021 |
+
cudaDeviceSynchronize 94.32% 9.488ms 94.32% 9.488ms 9.488ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4022 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4023 |
+
Self CPU time total: 10.060ms
|
| 4024 |
+
Self CUDA time total: 9.651ms
|
| 4025 |
|
| 4026 |
|
| 4027 |
impl wl p50(ms) ok
|
| 4028 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4029 |
+
hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
|
| 4030 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4031 |
+
hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
|
| 4032 |
</pre></div>
|
| 4033 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4034 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4035 |
<div class="uv-logs-content" style="display: none;">
|
| 4036 |
+
Installed 52 packages in 214ms
|
| 4037 |
</div>
|
| 4038 |
</div>
|
| 4039 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4040 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.01it/s]
|
| 4041 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.02it/s]</div>
|
|
|
|
| 4042 |
<div class="cell-artifacts">
|
| 4043 |
<h4>Artifacts:</h4>
|
| 4044 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
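The PROFILE TRACE tables in these reports are standard torch.profiler key-average tables: the named hf_kernels_layer_norm rows come from a record_function scope wrapped around the call, and the trailing cudaDeviceSynchronize row from the explicit sync before reading results. A minimal sketch that produces the same kind of table; the harness's exact warmup and rep settings differ:

import torch
from torch.profiler import ProfilerActivity, profile, record_function

x = torch.randn(16, 2048, 4096, device="cuda", dtype=torch.bfloat16)
w = torch.ones(4096, device="cuda", dtype=torch.bfloat16)
b = torch.zeros(4096, device="cuda", dtype=torch.bfloat16)

def impl(x, w, b):
    # stand-in for whichever implementation is under test
    return torch.nn.functional.layer_norm(x, (x.shape[-1],), w, b, 1e-5)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with record_function("hf_kernels_layer_norm"):  # the labeled rows above
        for _ in range(3):                          # "# of Calls" column -> 3
            impl(x, w, b)
    torch.cuda.synchronize()                        # the cudaDeviceSynchronize row

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))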
layer_norm/impls/torch_layer_norm.html
CHANGED
|
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: nv | 0.
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
-
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
-
| N/A
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
@@ -3918,9 +3918,9 @@ Cell: nv | 0.23s
|
|
| 3918 |
<span class="collapse-indicators">
|
| 3919 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
-
<span id="uv-indicator-benchmark"
|
| 3922 |
</span> |
|
| 3923 |
-
Cell: benchmark |
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3968,19 +3968,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
|
|
| 3968 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3969 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
-
torch_layer_norm 3.
|
| 3972 |
-
aten::layer_norm 0.41%
|
| 3973 |
-
aten::native_layer_norm 2.
|
| 3974 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3975 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.
|
| 3976 |
-
Activity Buffer Request 36
|
| 3977 |
-
aten::empty 1.
|
| 3978 |
-
cudaLaunchKernel 1.
|
| 3979 |
-
aten::view 0.19% 7.
|
| 3980 |
-
cudaDeviceSynchronize 54.
|
| 3981 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3982 |
-
Self CPU time total: 3.
|
| 3983 |
-
Self CUDA time total: 2.
|
| 3984 |
|
| 3985 |
|
| 3986 |
|
|
@@ -3990,19 +3990,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
|
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3992 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
-
torch_layer_norm 1.
|
| 3994 |
-
aten::layer_norm 0.
|
| 3995 |
-
aten::native_layer_norm 0.84% 53.
|
| 3996 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3997 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 3998 |
-
Activity Buffer Request 22.
|
| 3999 |
-
aten::empty 0.44% 28.
|
| 4000 |
-
cudaLaunchKernel 0.
|
| 4001 |
-
aten::view 0.
|
| 4002 |
-
cudaDeviceSynchronize 74.
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
-
Self CPU time total: 6.
|
| 4005 |
-
Self CUDA time total: 4.
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
@@ -4012,19 +4012,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
-
torch_layer_norm 1.
|
| 4016 |
-
aten::layer_norm 0.
|
| 4017 |
-
aten::native_layer_norm 0.
|
| 4018 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4019 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.
|
| 4020 |
-
Activity Buffer Request 22.
|
| 4021 |
-
aten::empty 0.
|
| 4022 |
-
cudaLaunchKernel 0.
|
| 4023 |
-
aten::view 0.06%
|
| 4024 |
-
cudaDeviceSynchronize 73.
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
-
Self CPU time total: 6.
|
| 4027 |
-
Self CUDA time total: 4.
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
@@ -4034,33 +4034,27 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
-
torch_layer_norm 0.
|
| 4038 |
-
aten::layer_norm 0.
|
| 4039 |
-
aten::native_layer_norm 0.
|
| 4040 |
-
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4041 |
-
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.
|
| 4042 |
-
Activity Buffer Request
|
| 4043 |
-
aten::empty 0.
|
| 4044 |
-
cudaLaunchKernel 2.
|
| 4045 |
-
aten::view 0.
|
| 4046 |
-
cudaDeviceSynchronize
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
-
Self CPU time total: 11.
|
| 4049 |
-
Self CUDA time total: 9.
|
| 4050 |
|
| 4051 |
|
| 4052 |
impl wl p50(ms) ok
|
| 4053 |
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
| 4054 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4055 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4056 |
-
torch_layer_norm LN_B16_S4096_D8192 3.
|
| 4057 |
</pre></div>
|
| 4058 |
-
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4059 |
-
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4060 |
-
<div class="uv-logs-content" style="display: none;">
|
| 4061 |
-
Installed 37 packages in 222ms
|
| 4062 |
-
</div>
|
| 4063 |
-
</div>
|
| 4064 |
<div class="cell-artifacts">
|
| 4065 |
<h4>Artifacts:</h4>
|
| 4066 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
|
|
|
| 3871 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: nv | 0.26s
|
| 3875 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3877 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3887 |
</div>
|
| 3888 |
</div>
|
| 3889 |
<div id="output-nv" class="cell-output">
|
| 3890 |
+
<div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:14:31 2025
|
| 3891 |
+-----------------------------------------------------------------------------------------+
|
| 3892 |
| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
|
| 3893 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3896 |
| | | MIG M. |
|
| 3897 |
|=========================================+========================+======================|
|
| 3898 |
| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
|
| 3899 |
+
| N/A 37C P0 140W / 350W | 0MiB / 46068MiB | 33% Default |
|
| 3900 |
| | | N/A |
|
| 3901 |
+-----------------------------------------+------------------------+----------------------+
|
| 3902 |
|
|
|
|
| 3918 |
<span class="collapse-indicators">
|
| 3919 |
<span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
|
| 3920 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3921 |
+
<span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3922 |
</span> |
|
| 3923 |
+
Cell: benchmark | 3.83s
|
| 3924 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3925 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3926 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3968 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3969 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3970 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3971 |
+
torch_layer_norm 3.57% 138.883us 45.78% 1.780ms 1.780ms 0.000us 0.00% 3.022ms 3.022ms 1
|
| 3972 |
+
aten::layer_norm 0.41% 16.121us 42.21% 1.641ms 546.912us 0.000us 0.00% 3.022ms 1.007ms 3
|
| 3973 |
+
aten::native_layer_norm 2.00% 77.621us 41.80% 1.625ms 541.538us 2.315ms 100.00% 3.022ms 1.007ms 3
|
| 3974 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.317ms 100.06% 2.317ms 2.317ms 1
|
| 3975 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.315ms 100.00% 2.315ms 771.810us 3
|
| 3976 |
+
Activity Buffer Request 37.36% 1.452ms 37.36% 1.452ms 1.452ms 706.306us 30.50% 706.306us 706.306us 1
|
| 3977 |
+
aten::empty 1.15% 44.871us 1.15% 44.871us 4.986us 0.000us 0.00% 0.000us 0.000us 9
|
| 3978 |
+
cudaLaunchKernel 1.10% 42.752us 1.10% 42.752us 14.251us 0.000us 0.00% 0.000us 0.000us 3
|
| 3979 |
+
aten::view 0.19% 7.379us 0.19% 7.379us 1.230us 0.000us 0.00% 0.000us 0.000us 6
|
| 3980 |
+
cudaDeviceSynchronize 54.22% 2.107ms 54.22% 2.107ms 2.107ms 0.000us 0.00% 0.000us 0.000us 1
|
| 3981 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3982 |
+
Self CPU time total: 3.887ms
|
| 3983 |
+
Self CUDA time total: 2.315ms
|
| 3984 |
|
| 3985 |
|
| 3986 |
|
|
|
|
| 3990 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3991 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 3992 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 3993 |
+
torch_layer_norm 1.48% 94.373us 25.56% 1.635ms 1.635ms 0.000us 0.00% 6.500ms 6.500ms 1
|
| 3994 |
+
aten::layer_norm 0.15% 9.600us 24.08% 1.541ms 513.581us 0.000us 0.00% 6.500ms 2.167ms 3
|
| 3995 |
+
aten::native_layer_norm 0.84% 53.630us 23.93% 1.531ms 510.381us 4.901ms 100.00% 6.500ms 2.167ms 3
|
| 3996 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.903ms 100.03% 4.903ms 4.903ms 1
|
| 3997 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.901ms 100.00% 4.901ms 1.634ms 3
|
| 3998 |
+
Activity Buffer Request 22.17% 1.418ms 22.17% 1.418ms 1.418ms 1.599ms 32.62% 1.599ms 1.599ms 1
|
| 3999 |
+
aten::empty 0.44% 28.023us 0.44% 28.023us 3.114us 0.000us 0.00% 0.000us 0.000us 9
|
| 4000 |
+
cudaLaunchKernel 0.43% 27.290us 0.43% 27.290us 9.097us 0.000us 0.00% 0.000us 0.000us 3
|
| 4001 |
+
aten::view 0.06% 3.930us 0.06% 3.930us 0.655us 0.000us 0.00% 0.000us 0.000us 6
|
| 4002 |
+
cudaDeviceSynchronize 74.44% 4.763ms 74.44% 4.763ms 4.763ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4003 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4004 |
+
Self CPU time total: 6.398ms
|
| 4005 |
+
Self CUDA time total: 4.901ms
|
| 4006 |
|
| 4007 |
|
| 4008 |
|
|
|
|
| 4012 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4013 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4014 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4015 |
+
torch_layer_norm 1.50% 93.544us 26.20% 1.631ms 1.631ms 0.000us 0.00% 6.249ms 6.249ms 1
|
| 4016 |
+
aten::layer_norm 0.18% 11.099us 24.70% 1.537ms 512.487us 0.000us 0.00% 6.249ms 2.083ms 3
|
| 4017 |
+
aten::native_layer_norm 0.84% 52.492us 24.52% 1.526ms 508.788us 4.730ms 100.00% 6.249ms 2.083ms 3
|
| 4018 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.731ms 100.03% 4.731ms 4.731ms 1
|
| 4019 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.730ms 100.00% 4.730ms 1.577ms 3
|
| 4020 |
+
Activity Buffer Request 22.71% 1.414ms 22.71% 1.414ms 1.414ms 1.519ms 32.12% 1.519ms 1.519ms 1
|
| 4021 |
+
aten::empty 0.45% 28.140us 0.45% 28.140us 3.127us 0.000us 0.00% 0.000us 0.000us 9
|
| 4022 |
+
cudaLaunchKernel 0.45% 28.230us 0.45% 28.230us 9.410us 0.000us 0.00% 0.000us 0.000us 3
|
| 4023 |
+
aten::view 0.06% 3.950us 0.06% 3.950us 0.658us 0.000us 0.00% 0.000us 0.000us 6
|
| 4024 |
+
cudaDeviceSynchronize 73.80% 4.594ms 73.80% 4.594ms 4.594ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4025 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4026 |
+
Self CPU time total: 6.225ms
|
| 4027 |
+
Self CUDA time total: 4.730ms
|
| 4028 |
|
| 4029 |
|
| 4030 |
|
|
|
|
| 4034 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4035 |
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
|
| 4036 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4037 |
+
torch_layer_norm 0.92% 106.312us 16.40% 1.902ms 1.902ms 0.000us 0.00% 13.074ms 13.074ms 1
|
| 4038 |
+
aten::layer_norm 0.08% 9.291us 15.49% 1.795ms 598.420us 0.000us 0.00% 13.074ms 4.358ms 3
|
| 4039 |
+
aten::native_layer_norm 0.48% 55.080us 15.41% 1.786ms 595.323us 9.836ms 100.00% 13.074ms 4.358ms 3
|
| 4040 |
+
torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.838ms 100.01% 9.838ms 9.838ms 1
|
| 4041 |
+
void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.836ms 100.00% 9.836ms 3.279ms 3
|
| 4042 |
+
Activity Buffer Request 12.48% 1.446ms 12.48% 1.446ms 1.446ms 3.238ms 32.91% 3.238ms 3.238ms 1
|
| 4043 |
+
aten::empty 0.25% 29.330us 0.25% 29.330us 3.259us 0.000us 0.00% 0.000us 0.000us 9
|
| 4044 |
+
cudaLaunchKernel 2.17% 251.116us 2.17% 251.116us 83.705us 0.000us 0.00% 0.000us 0.000us 3
|
| 4045 |
+
aten::view 0.03% 3.981us 0.03% 3.981us 0.663us 0.000us 0.00% 0.000us 0.000us 6
|
| 4046 |
+
cudaDeviceSynchronize 83.60% 9.692ms 83.60% 9.692ms 9.692ms 0.000us 0.00% 0.000us 0.000us 1
|
| 4047 |
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
|
| 4048 |
+
Self CPU time total: 11.593ms
|
| 4049 |
+
Self CUDA time total: 9.836ms
|
| 4050 |
|
| 4051 |
|
| 4052 |
impl wl p50(ms) ok
|
| 4053 |
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
| 4054 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4055 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4056 |
+
torch_layer_norm LN_B16_S4096_D8192 3.33 True
|
| 4057 |
</pre></div>
|
| 4058 |
<div class="cell-artifacts">
|
| 4059 |
<h4>Artifacts:</h4>
|
| 4060 |
<a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
|
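The p50 latencies above are consistent with a memory-bandwidth-bound kernel on the L40S. A rough estimate that counts only one read and one write of the bf16 activations; weight, bias, and the saved mean/rstd are ignored, so this slightly understates the bytes actually moved:

def eff_bw_gbs(batch, seq, dim, p50_ms, bytes_per_elem=2):
    moved = 2 * batch * seq * dim * bytes_per_elem  # read x once, write y once
    return moved / (p50_ms * 1e-3) / 1e9

print(eff_bw_gbs(16, 2048, 4096, 0.82))  # ~655 GB/s
print(eff_bw_gbs(16, 4096, 8192, 3.33))  # ~645 GB/s, vs ~864 GB/s L40S peak

Both implementations land within a few percent of each other on every workload, which is what this estimate predicts: once the op saturates DRAM bandwidth, there is little left for a custom kernel to win.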
layer_norm/results/artifacts/combine/latency.svg
CHANGED
|
|
|
|
layer_norm/results/combined_results.html
CHANGED
|
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -3956,70 +3956,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3956 |
<g id="matplotlib.axis_2">
|
| 3957 |
<g id="ytick_1">
|
| 3958 |
<g id="grid-y--2" class="grid grid-y">
|
| 3959 |
-
<path d="M 47.72 409.
|
| 3960 |
</g>
|
| 3961 |
<g id="line2d_5">
|
| 3962 |
<defs>
|
| 3963 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 3964 |
</defs>
|
| 3965 |
<g>
|
| 3966 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="409.
|
| 3967 |
</g>
|
| 3968 |
</g>
|
| 3969 |
<g id="text_5">
|
| 3970 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.
|
| 3971 |
</g>
|
| 3972 |
</g>
|
| 3973 |
<g id="ytick_2">
|
| 3974 |
<g id="grid-y--3" class="grid grid-y">
|
| 3975 |
-
<path d="M 47.72 331.
|
| 3976 |
</g>
|
| 3977 |
<g id="line2d_6">
|
| 3978 |
<g>
|
| 3979 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="331.
|
| 3980 |
</g>
|
| 3981 |
</g>
|
| 3982 |
<g id="text_6">
|
| 3983 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.
|
| 3984 |
</g>
|
| 3985 |
</g>
|
| 3986 |
<g id="ytick_3">
|
| 3987 |
<g id="grid-y--4" class="grid grid-y">
|
| 3988 |
-
<path d="M 47.72
|
| 3989 |
</g>
|
| 3990 |
<g id="line2d_7">
|
| 3991 |
<g>
|
| 3992 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 3993 |
</g>
|
| 3994 |
</g>
|
| 3995 |
<g id="text_7">
|
| 3996 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.
|
| 3997 |
</g>
|
| 3998 |
</g>
|
| 3999 |
<g id="ytick_4">
|
| 4000 |
<g id="grid-y--5" class="grid grid-y">
|
| 4001 |
-
<path d="M 47.72
|
| 4002 |
</g>
|
| 4003 |
<g id="line2d_8">
|
| 4004 |
<g>
|
| 4005 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4006 |
</g>
|
| 4007 |
</g>
|
| 4008 |
<g id="text_8">
|
| 4009 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4010 |
</g>
|
| 4011 |
</g>
|
| 4012 |
<g id="ytick_5">
|
| 4013 |
<g id="grid-y--6" class="grid grid-y">
|
| 4014 |
-
<path d="M 47.72
|
| 4015 |
</g>
|
| 4016 |
<g id="line2d_9">
|
| 4017 |
<g>
|
| 4018 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4019 |
</g>
|
| 4020 |
</g>
|
| 4021 |
<g id="text_9">
|
| 4022 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4023 |
</g>
|
| 4024 |
</g>
|
| 4025 |
<g id="label--y" class="ylabel">
|
|
@@ -4027,27 +4027,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4027 |
</g>
|
| 4028 |
</g>
|
| 4029 |
<g id="series--torch-layer-norm" class="series">
|
| 4030 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4031 |
<defs>
|
| 4032 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4033 |
</defs>
|
| 4034 |
<g clip-path="url(#p2214f54723)">
|
| 4035 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4036 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4037 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4038 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4039 |
</g>
|
| 4040 |
</g>
|
| 4041 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4042 |
-
<path d="M 83.741924 435.
|
| 4043 |
<defs>
|
| 4044 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4045 |
</defs>
|
| 4046 |
<g clip-path="url(#p2214f54723)">
|
| 4047 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="435.
|
| 4048 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="
|
| 4049 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="308.
|
| 4050 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.
|
| 4051 |
</g>
|
| 4052 |
</g>
|
| 4053 |
<g id="patch_3">
|
|
@@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4105 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4106 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4107 |
</span> |
|
| 4108 |
-
Cell: combine | 4.
|
| 4109 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4110 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4111 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4193,13 +4193,13 @@ COMBINED BENCHMARK SUMMARY
|
|
| 4193 |
|
| 4194 |
impl wl p50(ms) ok
|
| 4195 |
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
|
| 4196 |
-
hf_kernels_layer_norm LN_B16_S2048_D8192 1.
|
| 4197 |
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
|
| 4198 |
-
hf_kernels_layer_norm LN_B16_S4096_D8192 3.
|
| 4199 |
torch_layer_norm LN_B16_S2048_D4096 0.82 True
|
| 4200 |
torch_layer_norm LN_B16_S2048_D8192 1.68 True
|
| 4201 |
torch_layer_norm LN_B16_S4096_D4096 1.61 True
|
| 4202 |
-
torch_layer_norm LN_B16_S4096_D8192 3.
|
| 4203 |
|
| 4204 |
GENERATING COMBINED VISUALIZATION
|
| 4205 |
|
|
@@ -4219,7 +4219,7 @@ Implementations included:
|
|
| 4219 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4220 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4221 |
<div class="uv-logs-content" style="display: none;">
|
| 4222 |
-
Installed 37 packages in
|
| 4223 |
</div>
|
| 4224 |
</div>
|
| 4225 |
<div class="cell-artifacts">
|
|
@@ -4232,7 +4232,7 @@ Installed 37 packages in 195ms
|
|
| 4232 |
<rdf:RDF>
|
| 4233 |
<ns2:Work>
|
| 4234 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4235 |
-
<dc:date>2025-10-
|
| 4236 |
<dc:format>image/svg+xml</dc:format>
|
| 4237 |
<dc:creator>
|
| 4238 |
<ns2:Agent>
|
|
@@ -4316,70 +4316,70 @@ Installed 37 packages in 195ms
|
|
| 4316 |
<g id="matplotlib.axis_2">
|
| 4317 |
<g id="ytick_1">
|
| 4318 |
<g id="grid-y--2" class="grid grid-y">
|
| 4319 |
-
<path d="M 47.72 409.
|
| 4320 |
</g>
|
| 4321 |
<g id="line2d_5">
|
| 4322 |
<defs>
|
| 4323 |
<path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
|
| 4324 |
</defs>
|
| 4325 |
<g>
|
| 4326 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="409.
|
| 4327 |
</g>
|
| 4328 |
</g>
|
| 4329 |
<g id="text_5">
|
| 4330 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.
|
| 4331 |
</g>
|
| 4332 |
</g>
|
| 4333 |
<g id="ytick_2">
|
| 4334 |
<g id="grid-y--3" class="grid grid-y">
|
| 4335 |
-
<path d="M 47.72 331.
|
| 4336 |
</g>
|
| 4337 |
<g id="line2d_6">
|
| 4338 |
<g>
|
| 4339 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="331.
|
| 4340 |
</g>
|
| 4341 |
</g>
|
| 4342 |
<g id="text_6">
|
| 4343 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.
|
| 4344 |
</g>
|
| 4345 |
</g>
|
| 4346 |
<g id="ytick_3">
|
| 4347 |
<g id="grid-y--4" class="grid grid-y">
|
| 4348 |
-
<path d="M 47.72
|
| 4349 |
</g>
|
| 4350 |
<g id="line2d_7">
|
| 4351 |
<g>
|
| 4352 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4353 |
</g>
|
| 4354 |
</g>
|
| 4355 |
<g id="text_7">
|
| 4356 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.
|
| 4357 |
</g>
|
| 4358 |
</g>
|
| 4359 |
<g id="ytick_4">
|
| 4360 |
<g id="grid-y--5" class="grid grid-y">
|
| 4361 |
-
<path d="M 47.72
|
| 4362 |
</g>
|
| 4363 |
<g id="line2d_8">
|
| 4364 |
<g>
|
| 4365 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4366 |
</g>
|
| 4367 |
</g>
|
| 4368 |
<g id="text_8">
|
| 4369 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4370 |
</g>
|
| 4371 |
</g>
|
| 4372 |
<g id="ytick_5">
|
| 4373 |
<g id="grid-y--6" class="grid grid-y">
|
| 4374 |
-
<path d="M 47.72
|
| 4375 |
</g>
|
| 4376 |
<g id="line2d_9">
|
| 4377 |
<g>
|
| 4378 |
-
<use ns4:href="#m0fca2865ba" x="47.72" y="
|
| 4379 |
</g>
|
| 4380 |
</g>
|
| 4381 |
<g id="text_9">
|
| 4382 |
-
<text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="
|
| 4383 |
</g>
|
| 4384 |
</g>
|
| 4385 |
<g id="label--y" class="ylabel">
|
|
@@ -4387,27 +4387,27 @@ Installed 37 packages in 195ms
|
|
| 4387 |
</g>
|
| 4388 |
</g>
|
| 4389 |
<g id="series--torch-layer-norm" class="series">
|
| 4390 |
-
<path d="M 83.741924 437.689571 L 323.888085
|
| 4391 |
<defs>
|
| 4392 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4393 |
</defs>
|
| 4394 |
<g clip-path="url(#p2214f54723)">
|
| 4395 |
<use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4396 |
-
<use ns4:href="#md7efaf3aec" x="323.888085" y="
|
| 4397 |
-
<use ns4:href="#md7efaf3aec" x="564.034245" y="
|
| 4398 |
<use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4399 |
</g>
|
| 4400 |
</g>
|
| 4401 |
<g id="series--hf-kernels-layer-norm" class="series">
|
| 4402 |
-
<path d="M 83.741924 435.
|
| 4403 |
<defs>
|
| 4404 |
<path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
|
| 4405 |
</defs>
|
| 4406 |
<g clip-path="url(#p2214f54723)">
|
| 4407 |
-
<use ns4:href="#m9b8c54d372" x="83.741924" y="435.
|
| 4408 |
-
<use ns4:href="#m9b8c54d372" x="323.888085" y="
|
| 4409 |
-
<use ns4:href="#m9b8c54d372" x="564.034245" y="308.
|
| 4410 |
-
<use ns4:href="#m9b8c54d372" x="804.180406" y="56.
|
| 4411 |
</g>
|
| 4412 |
</g>
|
| 4413 |
<g id="patch_3">
|
|
|
|
(new side: the regenerated layer_norm latency figure; SVG metadata dated 2025-10-29T04:14:58, y-axis tick labels 1.0 / 1.5 / 2.0 / 2.5 / 3.0, and two four-point series, torch-layer-norm (#1f77b4) and hf-kernels-layer-norm (#ff7f0e), one point per LN workload)
|
|
|
|
Cell: combine | 4.24s

impl                   wl                  p50(ms)  ok
hf_kernels_layer_norm  LN_B16_S2048_D4096     0.83  True
hf_kernels_layer_norm  LN_B16_S2048_D8192     1.65  True
hf_kernels_layer_norm  LN_B16_S4096_D4096     1.65  True
hf_kernels_layer_norm  LN_B16_S4096_D8192     3.27  True
torch_layer_norm       LN_B16_S2048_D4096     0.82  True
torch_layer_norm       LN_B16_S2048_D8192     1.68  True
torch_layer_norm       LN_B16_S4096_D4096     1.61  True
torch_layer_norm       LN_B16_S4096_D8192     3.33  True

GENERATING COMBINED VISUALIZATION
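For quick reading of the summary above, a minimal sketch (p50 values copied from the table; the variable names are illustrative and not part of the benchmark harness) that computes the torch-eager vs. hf-kernels latency ratio per workload:

# p50 latencies in ms, copied from the combine-cell summary above.
p50 = {
    "LN_B16_S2048_D4096": (0.83, 0.82),  # (hf_kernels_layer_norm, torch_layer_norm)
    "LN_B16_S2048_D8192": (1.65, 1.68),
    "LN_B16_S4096_D4096": (1.65, 1.61),
    "LN_B16_S4096_D8192": (3.27, 3.33),
}
for wl, (hf, eager) in p50.items():
    # ratio > 1.0 means the hf-kernels implementation is faster on that workload
    print(f"{wl}: torch/hf = {eager / hf:.2f}x")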
|
▶ UV Install Logs: Installed 37 packages in 227ms
|
|
|
|
(embedded copy of the same regenerated layer_norm latency figure inside combined_results.html; metadata date, axis ticks, and series data identical to the artifact SVG above)
|
rotary/impls/artifacts/benchmark/rotary.jsonl
CHANGED
|
@@ -1,24 +1,24 @@
|
|
(old side: all 24 existing records replaced; their contents were truncated by the diff viewer beyond the "2025-10-" timestamp prefix and are not recoverable)
|
|
|
|
| 1 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17578399996409644, "p50": 0.17709399998011577, "p90": 0.17922400002134964, "mean": 0.17895179998959065, "iqr": 0.002560000041285093, "raw_times": [0.17666399998006455, 0.17709399998011577, 0.17578399996409644, 0.18599300000232688, 0.17922400002134964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18588400001817718, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21852399999033878, "p50": 0.22123499996951068, "p90": 0.22281499997234278, "mean": 0.22667299998602175, "iqr": 0.0019999999949504854, "raw_times": [0.2208149999773923, 0.22123499996951068, 0.21852399999033878, 0.24997600002052422, 0.22281499997234278], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22185399996033084, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21726399995714019, "p50": 0.22378500000286294, "p90": 0.22635499999523745, "mean": 0.22464679999529835, "iqr": 0.0036000000136482413, "raw_times": [0.22635499999523745, 0.22378500000286294, 0.21726399995714019, 0.23307500003966197, 0.2227549999815892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22023499997203544, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21678499996369283, "p50": 0.2199049999944691, "p90": 0.22050500001569162, "mean": 0.21960279999575505, "iqr": 0.0022199999989425123, "raw_times": [0.22050500001569162, 0.2199049999944691, 0.2182850000167491, 0.22253399998817258, 0.21678499996369283], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23155500002758345, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21946399999706045, "p50": 0.22019400000772293, "p90": 0.22058499996546743, "mean": 0.2201885999852493, "iqr": 0.0005499999815583578, "raw_times": [0.22019400000772293, 0.22003499998390907, 0.22058499996546743, 0.21946399999706045, 0.22066499997208666], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22136500001579407, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.215785000023061, "p50": 0.21925499999042586, "p90": 0.22044500002493805, "mean": 0.22373300000708696, "iqr": 0.003450000008342613, "raw_times": [0.215785000023061, 0.22044500002493805, 0.21925499999042586, 0.24618499998041443, 0.21699500001659544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22207500001059088, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 7 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21516500004281625, "p50": 0.2195149999693058, "p90": 0.22164500001053966, "mean": 0.21926680001342902, "iqr": 0.005660999988776894, "raw_times": [0.22402500002272063, 0.2195149999693058, 0.21516500004281625, 0.21598400002176277, 0.22164500001053966], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21819500000219705, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 8 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2141339999752745, "p50": 0.218735000032666, "p90": 0.21932399999968766, "mean": 0.22093040000754627, "iqr": 0.0017599999750927964, "raw_times": [0.2141339999752745, 0.23489500000550834, 0.21756400002459486, 0.218735000032666, 0.21932399999968766], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22086500001705645, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 9 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21650500002579065, "p50": 0.21868400000357724, "p90": 0.21925499999042586, "mean": 0.22372079999968264, "iqr": 0.0009299999987888441, "raw_times": [0.21650500002579065, 0.24583499998698244, 0.21925499999042586, 0.21832499999163701, 0.21868400000357724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22242500000402288, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 10 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2184950000128083, "p50": 0.22323500002130459, "p90": 0.22841400004836032, "mean": 0.22448680001616594, "iqr": 0.008849000039390376, "raw_times": [0.22841400004836032, 0.22323500002130459, 0.2184950000128083, 0.21956500000896995, 0.23272499998938656], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22242500000402288, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 11 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21709500003908033, "p50": 0.22114500001180204, "p90": 0.22174499997618113, "mean": 0.22064500001306442, "iqr": 0.004549999971459329, "raw_times": [0.22174499997618113, 0.22114500001180204, 0.22604500003353678, 0.21709500003908033, 0.2171950000047218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2633260000379778, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 12 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2232749999961925, "p50": 0.22420499999498134, "p90": 0.225494999995135, "mean": 0.22680499999978565, "iqr": 0.001499999996212864, "raw_times": [0.23705500001369728, 0.22399499999892214, 0.225494999995135, 0.22420499999498134, 0.2232749999961925], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22469399999636153, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 13 |
+
{"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21559499998602405, "p50": 0.2172250000285203, "p90": 0.21947499999441789, "mean": 0.2188926000030733, "iqr": 0.002500999983112706, "raw_times": [0.21559499998602405, 0.22519399999509915, 0.21697400001130518, 0.21947499999441789, 0.2172250000285203], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2516950000313045, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 14 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21588499998870248, "p50": 0.2187050000088675, "p90": 0.2197649999970963, "mean": 0.22497519998978532, "iqr": 0.0018900000213761814, "raw_times": [0.21787499997572013, 0.2526459999785402, 0.21588499998870248, 0.2187050000088675, 0.2197649999970963], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22343400002000635, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 15 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21829500002468194, "p50": 0.2230250000252454, "p90": 0.2236250000464679, "mean": 0.22705700001779405, "iqr": 0.0044400000547284435, "raw_times": [0.21829500002468194, 0.25115500000083557, 0.21918499999173946, 0.2230250000252454, 0.2236250000464679], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22709500001383276, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 16 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2149849999568687, "p50": 0.21960500004070127, "p90": 0.22131500003297333, "mean": 0.22512300001835683, "iqr": 0.0024400000029345392, "raw_times": [0.21960500004070127, 0.22131500003297333, 0.2149849999568687, 0.2188750000300388, 0.25083500003120207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22256500000139567, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 17 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21324499999764157, "p50": 0.21695399999543952, "p90": 0.22048499999982596, "mean": 0.21762459998626582, "iqr": 0.003631000026871334, "raw_times": [0.21685399997295463, 0.22058499996546743, 0.21695399999543952, 0.22048499999982596, 0.21324499999764157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2328750000515356, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 18 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21507499997142077, "p50": 0.21520500001770415, "p90": 0.21626500000593296, "mean": 0.2183889999969324, "iqr": 0.001121000025250396, "raw_times": [0.21626500000593296, 0.21507499997142077, 0.21520500001770415, 0.23025600000892155, 0.21514399998068257], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22121499995364502, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 19 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143149999938032, "p50": 0.21809500003655558, "p90": 0.21866499997713618, "mean": 0.2174867999997332, "iqr": 0.0025609999738662736, "raw_times": [0.2161040000032699, 0.2202549999879011, 0.21866499997713618, 0.2143149999938032, 0.21809500003655558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23317500000530345, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 20 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21576399996092732, "p50": 0.21857400003000294, "p90": 0.2222250000158965, "mean": 0.22089439999035676, "iqr": 0.004881000052137097, "raw_times": [0.21576399996092732, 0.23056499998119762, 0.21857400003000294, 0.2222250000158965, 0.21734399996375942], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22117399998933251, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 21 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.213884999993752, "p50": 0.21784400001934046, "p90": 0.21903500004327725, "mean": 0.2176128000087374, "iqr": 0.001270000041131425, "raw_times": [0.213884999993752, 0.21784400001934046, 0.21776500000214583, 0.21953499998517145, 0.21903500004327725], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22340499998563246, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 22 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21709399999281231, "p50": 0.2186049999863826, "p90": 0.21865499996920335, "mean": 0.22543699998323063, "iqr": 0.0004899999908047903, "raw_times": [0.21816499997839855, 0.25466599998935635, 0.21709399999281231, 0.2186049999863826, 0.21865499996920335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22284399994987325, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 23 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2291549999995368, "p50": 0.23022499999569845, "p90": 0.2316450000421355, "mean": 0.234377000015229, "iqr": 0.0017800000478018774, "raw_times": [0.2291549999995368, 0.2316450000421355, 0.23022499999569845, 0.25099500004444053, 0.22986499999433363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23142500003814348, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
| 24 |
+
{"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6352529999844592, "p50": 0.6405139999969833, "p90": 0.6429430000025604, "mean": 0.6394775999979174, "iqr": 0.007369000002199755, "raw_times": [0.6405139999969833, 0.6352529999844592, 0.6431040000052235, 0.6355740000003607, 0.6429430000025604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6388340000285098, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
|
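Each record above is a self-describing JSON object: the workload lives under "wl", latency statistics under "lat_ms", and correctness checks under "corr". A minimal sketch for pulling p50 latency per workload out of the artifact (the path matches this commit; adjust if the file moves):

import json

path = "rotary/impls/artifacts/benchmark/rotary.jsonl"
with open(path) as f:
    for line in f:
        rec = json.loads(line)
        # p50 latency in ms for each (implementation, workload) pair
        print(f'{rec["impl"]:<12} {rec["wl"]["name"]:<28} '
              f'p50={rec["lat_ms"]["p50"]:.3f} ms  ok={rec["ok"]}')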
rotary/impls/cells/benchmark.py
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
# "numpy",
|
| 5 |
# "torch==2.8.0",
|
| 6 |
# "kernels-benchmark-tools",
|
| 7 |
-
# "kernels",
|
| 8 |
# ]
|
| 9 |
#
|
| 10 |
# [tool.uv.sources]
|
|
@@ -13,35 +12,46 @@
|
|
(old side: the kernels-based implementation being removed; several removed lines were truncated by the diff viewer and are marked below)
 13  import torch
 14  import sys
 15  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
-16  from kernels import get_kernel
-18  # Load the rotary kernel
-19  rotary = get_kernel("kernels-community/rotary")
     [several removed lines truncated to empty by the diff viewer]
 23      rotary_dim = cos.shape[-1]
-25      # Clone to avoid modifying
 26      q_out = query.clone()
 27      k_out = key.clone()
 29      # Apply rotation to query
 30      q1 = q_out[..., :rotary_dim]
 31      q2 = q_out[..., rotary_dim : 2 * rotary_dim]
-32      [removed kernel call, truncated]
 34      # Apply rotation to key
 35      k1 = k_out[..., :rotary_dim]
 36      k2 = k_out[..., rotary_dim : 2 * rotary_dim]
-37      [removed kernel call, truncated]
 39      return q_out, k_out
 42  run_benchmark(
 43      kernel_type=KernelTypeEnum.ROTARY,
-44      impl_name="…  [value truncated]
-45      impl_tags={"family": "…  [value truncated]
-46      impl_func=…  [value truncated]
 47  )
|
|
|
|
(new side: the pure-PyTorch replacement, full listing)

# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]

import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Reference rotary implementation."""
    if not conj:
        out1 = x1 * cos - x2 * sin
        out2 = x1 * sin + x2 * cos
    else:
        out1 = x1 * cos + x2 * sin
        out2 = -x1 * sin + x2 * cos
    return out1, out2


def torch_rotary(query, key, cos, sin, conj=False):
    rotary_dim = cos.shape[-1]

    # Clone inputs to avoid modifying them
    q_out = query.clone()
    k_out = key.clone()

    # Apply rotation to query
    q1 = q_out[..., :rotary_dim]
    q2 = q_out[..., rotary_dim : 2 * rotary_dim]
    q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
    q_out[..., :rotary_dim] = q_out_1
    q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2

    # Apply rotation to key
    k1 = k_out[..., :rotary_dim]
    k2 = k_out[..., rotary_dim : 2 * rotary_dim]
    k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
    k_out[..., :rotary_dim] = k_out_1
    k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2

    return q_out, k_out


run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    impl_name="torch_eager",
    impl_tags={"family": "pytorch", "backend": "eager"},
    impl_func=torch_rotary,
)
|
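A minimal usage sketch for the torch_rotary reference above. The tensor shapes follow the workload naming (e.g. cuda_B2_S512_H8_D64_R32 means batch=2, seqlen=512, heads=8, head_dim=64, rotary_dim=32); the cos/sin construction below is one common RoPE convention and is assumed here, since the benchmark harness builds its own tables:

import torch

B, S, H, D, R = 2, 512, 8, 64, 32
q = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
k = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)

# Assumed angle table: one row per position, one column per rotary channel.
inv_freq = 1.0 / (10000.0 ** (torch.arange(R, device="cuda").float() / R))
angles = torch.arange(S, device="cuda").float()[:, None] * inv_freq[None, :]
cos = angles.cos()[None, :, None, :].to(q.dtype)  # broadcasts over batch and heads
sin = angles.sin()[None, :, None, :].to(q.dtype)

q_rot, k_rot = torch_rotary(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)  # torch.Size([2, 512, 8, 64]) twice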
rotary/impls/hf_kernels_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/impls/torch_rotary.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
rotary/results/artifacts/combine/latency.svg
CHANGED
|
|
|
|
rotary/results/combined_results.html
CHANGED
|
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3872 |
<rdf:RDF>
|
| 3873 |
<ns2:Work>
|
| 3874 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 3875 |
-
<dc:date>2025-10-
|
| 3876 |
<dc:format>image/svg+xml</dc:format>
|
| 3877 |
<dc:creator>
|
| 3878 |
<ns2:Agent>
|
|
@@ -4216,70 +4216,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
(old side: y-axis grid lines, tick marks, and tick labels for the rotary latency plot; the removed coordinate and label values were truncated by the diff viewer)
|
| 4285 |
<g id="label--y" class="ylabel">
|
|
@@ -4287,34 +4287,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
(old side: the torch-eager latency series, 24 circular markers; only the first data point, x=82.97 y=405.06, and the last, x=787.90 y=44.89, survived truncation)
|
|
@@ -4364,7 +4364,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 4364 |
<span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
|
| 4365 |
<span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4366 |
</span> |
|
| 4367 |
-
Cell: combine | 4.
|
| 4368 |
| <button class="run-btn" onclick="runCell('combine')">▶ run</button>
|
| 4369 |
<button class="copy-btn" onclick="copyCell('combine')">Copy</button>
|
| 4370 |
<a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4451,7 +4451,7 @@ Summary: 2 found, 0 skipped, 0 missing
|
|
| 4451 |
COMBINED BENCHMARK SUMMARY
|
| 4452 |
|
| 4453 |
impl wl p50(ms) ok
|
| 4454 |
-
hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.
|
| 4455 |
hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.10 False
|
| 4456 |
hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
|
| 4457 |
hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
|
|
@@ -4462,7 +4462,7 @@ hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False
|
|
| 4462 |
hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False
|
| 4463 |
hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False
|
| 4464 |
hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False
|
| 4465 |
-
hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.
|
| 4466 |
hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False
|
| 4467 |
hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False
|
| 4468 |
hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False
|
|
@@ -4470,35 +4470,35 @@ hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False
|
|
| 4470 |
hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False
|
| 4471 |
hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False
|
| 4472 |
hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False
|
| 4473 |
-
hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.
|
| 4474 |
-
hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.
|
| 4475 |
-
hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.
|
| 4476 |
hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False
|
| 4477 |
hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
|
| 4478 |
-
torch_eager cuda_B1_S128_H32_D128_R64 0.
|
| 4479 |
-
torch_eager cuda_B1_S128_H32_D64_R32 0.
|
| 4480 |
-
torch_eager cuda_B1_S128_H8_D128_R64 0.
|
| 4481 |
torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
|
| 4482 |
-
torch_eager cuda_B1_S2048_H32_D128_R64 0.
|
| 4483 |
-
torch_eager cuda_B1_S2048_H32_D64_R32 0.
|
| 4484 |
-
torch_eager cuda_B1_S2048_H8_D128_R64 0.
|
| 4485 |
-
torch_eager cuda_B1_S2048_H8_D64_R32 0.
|
| 4486 |
-
torch_eager cuda_B1_S512_H32_D128_R64 0.
|
| 4487 |
-
torch_eager cuda_B1_S512_H32_D64_R32 0.
|
| 4488 |
-
torch_eager cuda_B1_S512_H8_D128_R64 0.
|
| 4489 |
-
torch_eager cuda_B1_S512_H8_D64_R32 0.
|
| 4490 |
-
torch_eager cuda_B2_S128_H32_D128_R64 0.
|
| 4491 |
-
torch_eager cuda_B2_S128_H32_D64_R32 0.
|
| 4492 |
-
torch_eager cuda_B2_S128_H8_D128_R64 0.
|
| 4493 |
-
torch_eager cuda_B2_S128_H8_D64_R32 0.
|
| 4494 |
torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
|
| 4495 |
torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
|
| 4496 |
-
torch_eager cuda_B2_S2048_H8_D128_R64 0.
|
| 4497 |
-
torch_eager cuda_B2_S2048_H8_D64_R32 0.
|
| 4498 |
-
torch_eager cuda_B2_S512_H32_D128_R64 0.
|
| 4499 |
-
torch_eager cuda_B2_S512_H32_D64_R32 0.
|
| 4500 |
-
torch_eager cuda_B2_S512_H8_D128_R64 0.
|
| 4501 |
-
torch_eager cuda_B2_S512_H8_D64_R32 0.
|
| 4502 |
|
| 4503 |
GENERATING COMBINED VISUALIZATION
|
| 4504 |
|
|
@@ -4518,7 +4518,7 @@ Implementations included:
|
|
| 4518 |
<div class="uv-install-logs" id="uv-logs-combine">
|
| 4519 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4520 |
<div class="uv-logs-content" style="display: none;">
|
| 4521 |
-
Installed 37 packages in
|
| 4522 |
</div>
|
| 4523 |
</div>
|
| 4524 |
<div class="cell-artifacts">
|
|
@@ -4531,7 +4531,7 @@ Installed 37 packages in 224ms
|
|
| 4531 |
<rdf:RDF>
|
| 4532 |
<ns2:Work>
|
| 4533 |
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
| 4534 |
-
<dc:date>2025-10-
|
| 4535 |
<dc:format>image/svg+xml</dc:format>
|
| 4536 |
<dc:creator>
|
| 4537 |
<ns2:Agent>
|
|
@@ -4875,70 +4875,70 @@ Installed 37 packages in 224ms
|
|
(old side, embedded copy: the same y-axis grid, tick, and label elements as above; coordinate and label values truncated by the diff viewer)
|
| 4944 |
<g id="label--y" class="ylabel">
|
|
@@ -4946,34 +4946,34 @@ Installed 37 packages in 224ms
|
|
| 4946 |
</g>
|
| 4947 |
</g>
|
| 4948 |
<g id="series--torch-eager" class="series">
|
| 4949 |
-
<path d="M 82.966497 405.060892 L 113.615625
|
| 4950 |
<defs>
|
| 4951 |
<path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
|
| 4952 |
</defs>
|
| 4953 |
<g clip-path="url(#p088c925177)">
|
| 4954 |
<use ns4:href="#md7efaf3aec" x="82.966497" y="405.060892" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4955 |
-
<use ns4:href="#md7efaf3aec" x="113.615625" y="
|
| 4956 |
-
<use ns4:href="#md7efaf3aec" x="144.264753" y="
|
| 4957 |
-
<use ns4:href="#md7efaf3aec" x="174.913881" y="
|
| 4958 |
-
<use ns4:href="#md7efaf3aec" x="205.563009" y="
|
| 4959 |
-
<use ns4:href="#md7efaf3aec" x="236.212137" y="
|
| 4960 |
-
<use ns4:href="#md7efaf3aec" x="266.861265" y="
|
| 4961 |
-
<use ns4:href="#md7efaf3aec" x="297.510393" y="
|
| 4962 |
-
<use ns4:href="#md7efaf3aec" x="328.159521" y="
|
| 4963 |
-
<use ns4:href="#md7efaf3aec" x="358.808648" y="
|
| 4964 |
-
<use ns4:href="#md7efaf3aec" x="389.457776" y="
|
| 4965 |
-
<use ns4:href="#md7efaf3aec" x="420.106904" y="
|
| 4966 |
-
<use ns4:href="#md7efaf3aec" x="450.756032" y="
|
| 4967 |
-
<use ns4:href="#md7efaf3aec" x="481.40516" y="
|
| 4968 |
-
<use ns4:href="#md7efaf3aec" x="512.054288" y="
|
| 4969 |
-
<use ns4:href="#md7efaf3aec" x="542.703416" y="
|
| 4970 |
-
<use ns4:href="#md7efaf3aec" x="573.352544" y="
|
| 4971 |
-
<use ns4:href="#md7efaf3aec" x="604.001672" y="
|
| 4972 |
-
<use ns4:href="#md7efaf3aec" x="634.6508" y="
|
| 4973 |
-
<use ns4:href="#md7efaf3aec" x="665.299928" y="
|
| 4974 |
-
<use ns4:href="#md7efaf3aec" x="695.949056" y="
|
| 4975 |
-
<use ns4:href="#md7efaf3aec" x="726.598184" y="
|
| 4976 |
-
<use ns4:href="#md7efaf3aec" x="757.247312" y="
|
| 4977 |
<use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
|
| 4978 |
</g>
|
| 4979 |
</g>
|
|
|
|
| 3872 |   <rdf:RDF>
| 3873 |   <ns2:Work>
| 3874 |   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 3875 | + <dc:date>2025-10-29T04:15:02.721683</dc:date>
| 3876 |   <dc:format>image/svg+xml</dc:format>
| 3877 |   <dc:creator>
| 3878 |   <ns2:Agent>
| 4216 | … | 4320 | + [latency.svg, regenerated: y-axis gridlines, tick marks, and tick labels 0.2–0.6, plus the "series--torch-eager" line and point markers with updated coordinates; identical markup recurs at lines 4875–4979 below]
| 4364 |   <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
| 4365 |   <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
| 4366 |   </span> |
| 4367 | + Cell: combine | 4.35s
| 4368 |   | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
| 4369 |   <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
| 4370 |   <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
| 4451 |   COMBINED BENCHMARK SUMMARY
| 4452 |
| 4453 |   impl               wl                          p50(ms)  ok
| 4454 | + hf_kernels_rotary  cuda_B1_S128_H32_D128_R64   0.10     False
| 4455 |   hf_kernels_rotary  cuda_B1_S128_H32_D64_R32    0.10     False
| 4456 |   hf_kernels_rotary  cuda_B1_S128_H8_D128_R64    0.10     False
| 4457 |   hf_kernels_rotary  cuda_B1_S128_H8_D64_R32     0.08     False
          …
| 4462 |   hf_kernels_rotary  cuda_B1_S512_H32_D128_R64   0.09     False
| 4463 |   hf_kernels_rotary  cuda_B1_S512_H32_D64_R32    0.09     False
| 4464 |   hf_kernels_rotary  cuda_B1_S512_H8_D128_R64    0.09     False
| 4465 | + hf_kernels_rotary  cuda_B1_S512_H8_D64_R32     0.10     False
| 4466 |   hf_kernels_rotary  cuda_B2_S128_H32_D128_R64   0.09     False
| 4467 |   hf_kernels_rotary  cuda_B2_S128_H32_D64_R32    0.09     False
| 4468 |   hf_kernels_rotary  cuda_B2_S128_H8_D128_R64    0.09     False
          …
| 4470 |   hf_kernels_rotary  cuda_B2_S2048_H32_D128_R64  0.28     False
| 4471 |   hf_kernels_rotary  cuda_B2_S2048_H32_D64_R32   0.10     False
| 4472 |   hf_kernels_rotary  cuda_B2_S2048_H8_D128_R64   0.09     False
| 4473 | + hf_kernels_rotary  cuda_B2_S2048_H8_D64_R32    0.10     False
| 4474 | + hf_kernels_rotary  cuda_B2_S512_H32_D128_R64   0.10     False
| 4475 | + hf_kernels_rotary  cuda_B2_S512_H32_D64_R32    0.09     False
| 4476 |   hf_kernels_rotary  cuda_B2_S512_H8_D128_R64    0.09     False
| 4477 |   hf_kernels_rotary  cuda_B2_S512_H8_D64_R32     0.09     False
| 4478 | + torch_eager        cuda_B1_S128_H32_D128_R64   0.22     True
| 4479 | + torch_eager        cuda_B1_S128_H32_D64_R32    0.22     True
| 4480 | + torch_eager        cuda_B1_S128_H8_D128_R64    0.22     True
| 4481 |   torch_eager        cuda_B1_S128_H8_D64_R32     0.18     True
| 4482 | + torch_eager        cuda_B1_S2048_H32_D128_R64  0.22     True
| 4483 | + torch_eager        cuda_B1_S2048_H32_D64_R32   0.22     True
| 4484 | + torch_eager        cuda_B1_S2048_H8_D128_R64   0.22     True
| 4485 | + torch_eager        cuda_B1_S2048_H8_D64_R32    0.22     True
| 4486 | + torch_eager        cuda_B1_S512_H32_D128_R64   0.22     True
| 4487 | + torch_eager        cuda_B1_S512_H32_D64_R32    0.22     True
| 4488 | + torch_eager        cuda_B1_S512_H8_D128_R64    0.22     True
| 4489 | + torch_eager        cuda_B1_S512_H8_D64_R32     0.22     True
| 4490 | + torch_eager        cuda_B2_S128_H32_D128_R64   0.22     True
| 4491 | + torch_eager        cuda_B2_S128_H32_D64_R32    0.22     True
| 4492 | + torch_eager        cuda_B2_S128_H8_D128_R64    0.22     True
| 4493 | + torch_eager        cuda_B2_S128_H8_D64_R32     0.22     True
| 4494 |   torch_eager        cuda_B2_S2048_H32_D128_R64  0.64     True
| 4495 |   torch_eager        cuda_B2_S2048_H32_D64_R32   0.23     True
| 4496 | + torch_eager        cuda_B2_S2048_H8_D128_R64   0.22     True
| 4497 | + torch_eager        cuda_B2_S2048_H8_D64_R32    0.22     True
| 4498 | + torch_eager        cuda_B2_S512_H32_D128_R64   0.22     True
| 4499 | + torch_eager        cuda_B2_S512_H32_D64_R32    0.22     True
| 4500 | + torch_eager        cuda_B2_S512_H8_D128_R64    0.22     True
| 4501 | + torch_eager        cuda_B2_S512_H8_D64_R32     0.22     True
| 4502 |
| 4503 |   GENERATING COMBINED VISUALIZATION
| 4504 |
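Reading the table: p50(ms) is the median of the timed repetitions for each workload, and ok is the correctness verdict against a float32 reference (in the JSONL artifacts these live under lat_ms and corr). Below is a minimal sketch of how such a summary could be rebuilt from an artifact file; the path matches the rotary artifact in this commit, but the script is illustrative and not the repo's actual combine.py.

    import json
    import statistics
    from pathlib import Path

    # One record per (impl, workload) run; format as in the benchmark JSONL artifacts.
    path = Path("rotary/impls/artifacts/benchmark/rotary.jsonl")
    records = [json.loads(l) for l in path.read_text().splitlines() if l.strip()]

    print(f"{'impl':<20} {'wl':<28} {'p50(ms)':>8} {'ok':>6}")
    for rec in sorted(records, key=lambda r: (r["impl"], r["wl"]["name"])):
        p50 = rec["lat_ms"]["p50"]  # precomputed; equals the median of raw_times
        assert abs(p50 - statistics.median(rec["lat_ms"]["raw_times"])) < 1e-6
        ok = rec["ok"] and rec["corr"]["ok"]  # ran successfully AND matched the fp32 reference
        print(f"{rec['impl']:<20} {rec['wl']['name']:<28} {p50:>8.2f} {str(ok):>6}")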
| 4518 |   <div class="uv-install-logs" id="uv-logs-combine">
| 4519 |   <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
| 4520 |   <div class="uv-logs-content" style="display: none;">
| 4521 | + Installed 37 packages in 196ms
| 4522 |   </div>
| 4523 |   </div>
| 4524 |   <div class="cell-artifacts">
| 4531 |   <rdf:RDF>
| 4532 |   <ns2:Work>
| 4533 |   <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
| 4534 | + <dc:date>2025-10-29T04:15:02.721683</dc:date>
| 4535 |   <dc:format>image/svg+xml</dc:format>
| 4536 |   <dc:creator>
| 4537 |   <ns2:Agent>
| 4875 | … | 4979 | + [latency.svg, regenerated: identical y-axis gridline/tick-label markup (0.2–0.6) and "series--torch-eager" path/markers as at lines 4216–4320 above, repeated at this second location in the file]
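The collapsed SVG markup above is matplotlib output (hence the embedded dc:date metadata): workloads along the x-axis, p50 latency in milliseconds on the y-axis, one line per implementation. A minimal sketch of how such a latency.svg could be produced; the values here are made up for illustration.

    import matplotlib
    matplotlib.use("Agg")  # headless rendering, as on a CI box
    import matplotlib.pyplot as plt

    # Hypothetical p50 latencies (ms) per workload for one implementation.
    workloads = ["cuda_B1_S128_H8_D64_R32", "cuda_B1_S512_H8_D64_R32", "cuda_B2_S2048_H32_D128_R64"]
    p50_ms = {"torch_eager": [0.18, 0.22, 0.64]}

    fig, ax = plt.subplots(figsize=(9, 4.8))
    for impl, values in p50_ms.items():
        ax.plot(range(len(workloads)), values, marker="o", label=impl)
    ax.set_xticks(range(len(workloads)))
    ax.set_xticklabels(workloads, rotation=45, ha="right", fontsize=8)
    ax.set_ylabel("p50 latency (ms)")
    ax.grid(axis="y", alpha=0.3)  # the faint horizontal gridlines seen in the markup
    ax.legend()
    fig.tight_layout()
    fig.savefig("latency.svg")  # matplotlib writes the RDF/dc:date metadata block shown above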