drbh (HF Staff) committed on
Commit b17c1a4 · verified · 1 Parent(s): 370f848

Upload folder using huggingface_hub

Files changed (33)
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/cells/benchmark.py +7 -13
  3. activation/impls/hf_kernels_swiglu.html +144 -96
  4. activation/impls/torch_swiglu.html +122 -128
  5. activation/results/artifacts/combine/latency.svg +38 -38
  6. activation/results/combined_results.html +79 -79
  7. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  8. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  9. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  10. causal_conv1d/results/artifacts/combine/latency.svg +64 -64
  11. causal_conv1d/results/combined_results.html +142 -142
  12. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  13. flash_attn/impls/cells/benchmark.py +8 -9
  14. flash_attn/impls/flash_attention.html +137 -137
  15. flash_attn/impls/hf_kernels_flash_attn.html +92 -92
  16. flash_attn/impls/hf_kernels_flash_attn3.html +84 -89
  17. flash_attn/impls/mem_efficient_attention.html +133 -185
  18. flash_attn/impls/sage_attention.html +17 -12
  19. flash_attn/impls/xformers.html +91 -91
  20. flash_attn/results/artifacts/combine/latency.svg +55 -55
  21. flash_attn/results/combined_results.html +141 -141
  22. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  23. layer_norm/impls/cells/benchmark.py +5 -28
  24. layer_norm/impls/hf_kernels_layer_norm.html +54 -55
  25. layer_norm/impls/torch_layer_norm.html +54 -60
  26. layer_norm/results/artifacts/combine/latency.svg +24 -24
  27. layer_norm/results/combined_results.html +53 -53
  28. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
  29. rotary/impls/cells/benchmark.py +21 -11
  30. rotary/impls/hf_kernels_rotary.html +0 -0
  31. rotary/impls/torch_rotary.html +0 -0
  32. rotary/results/artifacts/combine/latency.svg +39 -39
  33. rotary/results/combined_results.html +106 -106
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.022950000015953265, "p50": 0.023951000002853107, "p90": 0.0245499999778076, "mean": 0.02414040001212925, "iqr": 0.0010899999551838846, "raw_times": [0.02579100004140855, 0.0245499999778076, 0.023951000002853107, 0.022950000015953265, 0.023460000022623717], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031180999997104664, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02659000000448941, "p50": 0.03026100000624865, "p90": 0.03163099995617813, "mean": 0.03016299999671901, "iqr": 0.001709999935428641, "raw_times": [0.02659000000448941, 0.03026100000624865, 0.02992100002074949, 0.03163099995617813, 0.032411999995929364], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03256100001181039, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02795999995441889, "p50": 0.0293610000312583, "p90": 0.02937200002861573, "mean": 0.029306999988421012, "iqr": 9.100006082007894e-05, "raw_times": [0.02795999995441889, 0.03056099996001649, 0.0293610000312583, 0.02928099996779565, 0.02937200002861573], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03265100002636245, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02837199997429707, "p50": 0.029151000035199104, "p90": 0.0292910000325719, "mean": 0.028971200003979902, "iqr": 0.0007500000265281415, "raw_times": [0.02854100000604376, 0.0292910000325719, 0.029500999971787678, 0.029151000035199104, 0.02837199997429707], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03205100000513994, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0284509999914917, "p50": 0.02926099995192999, "p90": 0.029411000014079036, "mean": 0.029144599977826147, "iqr": 0.0005010000450056395, "raw_times": [0.028909999969073397, 0.029689999962556612, 0.029411000014079036, 0.0284509999914917, 0.02926099995192999], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031930999966789386, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.027061000025696558, "p50": 0.028121000013925368, "p90": 0.02836999999544787, "mean": 0.027967000005446607, "iqr": 0.0005990000317979138, "raw_times": [0.027770999963649956, 0.028512000028513285, 0.028121000013925368, 0.02836999999544787, 0.027061000025696558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030291000030047144, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02748099996097153, "p50": 0.029001000029893476, "p90": 0.030041000002256624, "mean": 0.029116999996858794, "iqr": 0.0011299999869152089, "raw_times": [0.02748099996097153, 0.030150999975830928, 0.030041000002256624, 0.029001000029893476, 0.028911000015341415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031200999956126907, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028581000037775084, "p50": 0.028771000017968618, "p90": 0.02886099997567726, "mean": 0.028774800000519463, "iqr": 0.00020999999605919584, "raw_times": [0.028581000037775084, 0.02900999999155829, 0.028771000017968618, 0.028650999979618064, 0.02886099997567726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03162100000508872, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-10-29T00:36:55Z", "run": "c0d95931b9f04e56b6b3c65d3fccc607", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028431000032469456, "p50": 0.029390999998213374, "p90": 0.029580999978406908, "mean": 0.029274800010625768, "iqr": 0.00035999994452140527, "raw_times": [0.028431000032469456, 0.029221000033885502, 0.0297500000101536, 0.029390999998213374, 0.029580999978406908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030401000003621448, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.039330999982212234, "p50": 0.04005099998494188, "p90": 0.04157099999702041, "mean": 0.040440999998736515, "iqr": 0.0020999999605919584, "raw_times": [0.03947100003642845, 0.04157099999702041, 0.04005099998494188, 0.041780999993079604, 0.039330999982212234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047832000007019815, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0515919999770631, "p50": 0.05179099997576486, "p90": 0.05224099999168175, "mean": 0.05211119997738933, "iqr": 0.0006300000450210064, "raw_times": [0.05224099999168175, 0.0515919999770631, 0.05161099994666074, 0.05179099997576486, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0555309999867859, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04996100000198567, "p50": 0.05194099998107049, "p90": 0.05195099998900332, "mean": 0.05124099998283782, "iqr": 0.0016000000186977559, "raw_times": [0.05194099998107049, 0.05195099998900332, 0.04996100000198567, 0.05200099997182406, 0.050350999970305566], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05537100003039086, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04920100002436811, "p50": 0.05169100001012339, "p90": 0.05200099997182406, "mean": 0.051318999987870484, "iqr": 0.000339999985499162, "raw_times": [0.051660999986324896, 0.05204099994671196, 0.04920100002436811, 0.05200099997182406, 0.05169100001012339], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055880999980217894, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04919000002701068, "p50": 0.05105200000343757, "p90": 0.05142099996646721, "mean": 0.050994999992326484, "iqr": 0.0005200000146032835, "raw_times": [0.05090099995186392, 0.05241100001285304, 0.05142099996646721, 0.04919000002701068, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054681999984040885, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04663100003199361, "p50": 0.05066099998884965, "p90": 0.05077099996242396, "mean": 0.049591000004056696, "iqr": 0.0016599999526079046, "raw_times": [0.04911100000981605, 0.05077099996242396, 0.04663100003199361, 0.05078100002720021, 0.05066099998884965], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05457200001046658, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04728099997919344, "p50": 0.050772000008691975, "p90": 0.051271000018005, "mean": 0.04967720000195186, "iqr": 0.003820000017640268, "raw_times": [0.04728099997919344, 0.051271000018005, 0.05161100000350416, 0.050772000008691975, 0.04745100000036473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05381099998658101, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04900099997939833, "p50": 0.04957199996624695, "p90": 0.05115100003649786, "mean": 0.05033119999779956, "iqr": 0.001620000034563418, "raw_times": [0.04900099997939833, 0.052401000004920206, 0.04957199996624695, 0.05115100003649786, 0.049531000001934444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05343100002619394, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-10-29T04:14:27Z", "run": "294e7842944942f68c6d635847dc1d1f", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04688100000294071, "p50": 0.04992099997025434, "p90": 0.05054100000734252, "mean": 0.049500799991619715, "iqr": 0.0023510000346504967, "raw_times": [0.04688100000294071, 0.04992099997025434, 0.05054100000734252, 0.04818999997269202, 0.051971000004868984], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05505100000391394, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
  # "numpy",
  # "torch==2.8.0",
  # "kernels-benchmark-tools",
- # "kernels",
  # ]
  #
  # [tool.uv.sources]
@@ -13,22 +12,17 @@
  import torch
  import sys
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
- from kernels import get_kernel
+ import torch.nn.functional as F

- # Load the activation kernel
- activation = get_kernel("kernels-community/activation")
-
-
- def hf_kernels_swiglu(input_tensor):
-     hidden_dim = input_tensor.shape[-1] // 2
-     out_shape = input_tensor.shape[:-1] + (hidden_dim,)
-     out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
-     return activation.silu_and_mul(out, input_tensor)
+
+
+ def swiglu_eager(x):
+     d = x.shape[-1] // 2
+     return F.silu(x[..., :d]) * x[..., d:]


  run_benchmark(
      kernel_type=KernelTypeEnum.ACTIVATION,
-     impl_name="hf_kernels_swiglu",
-     impl_tags={"family": "hf-kernels", "backend": "cuda"},
-     impl_func=hf_kernels_swiglu,
+     impl_name="torch_eager",
+     impl_tags={"family": "hf-kernels", "backend": "eager"},
+     impl_func=swiglu_eager,
  )
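Net effect of this cell change: the kernels dependency and the kernels-community/activation CUDA kernel are dropped, and the benchmark now measures a plain eager-mode SwiGLU, SiLU(x[..., :d]) * x[..., d:] with d = hidden/2. A self-contained sketch of that reference, including the same fp32 tolerance check the harness records ("ref": "swiglu_fp32", rtol=atol=0.02); the tensor shape here is illustrative:

import torch
import torch.nn.functional as F

def swiglu_eager(x):
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(128, 2 * 768, dtype=torch.bfloat16, device=device)
ref = swiglu_eager(x.float())        # fp32 reference path
out = swiglu_eager(x).float()        # bf16 path under test
print(torch.allclose(out, ref, rtol=0.02, atol=0.02))  # expect True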
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 00:36:01 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 29C P0 77W / 350W | 0MiB / 46068MiB | 0% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.21s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 4.27s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 74.624us 1850.79% 74.624us 74.624us 1
3980
- hf_kernels_swiglu 11.04% 191.977us 99.56% 1.732ms 1.732ms 0.000us 0.00% 5.440us 5.440us 1
3981
- _activation_beeaae6::silu_and_mul 1.14% 19.900us 85.86% 1.493ms 497.784us 4.032us 100.00% 5.440us 1.813us 3
3982
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.032us 100.00% 4.032us 1.344us 3
3983
- Activity Buffer Request 82.36% 1.432ms 82.36% 1.432ms 1.432ms 1.408us 34.92% 1.408us 1.408us 1
3984
- aten::empty 2.66% 46.201us 2.66% 46.201us 15.400us 0.000us 0.00% 0.000us 0.000us 3
3985
- cudaLaunchKernel 2.36% 41.042us 2.36% 41.042us 13.681us 0.000us 0.00% 0.000us 0.000us 3
3986
- cudaDeviceSynchronize 0.44% 7.690us 0.44% 7.690us 7.690us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- Self CPU time total: 1.739ms
3989
- Self CUDA time total: 4.032us
3990
 
3991
 
3992
 
@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.016us 1462.10% 58.016us 58.016us 1
4000
- hf_kernels_swiglu 6.64% 105.933us 99.68% 1.591ms 1.591ms 0.000us 0.00% 5.280us 5.280us 1
4001
- _activation_beeaae6::silu_and_mul 1.34% 21.350us 91.75% 1.465ms 488.260us 3.968us 100.00% 5.280us 1.760us 3
4002
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
4003
- Activity Buffer Request 88.86% 1.419ms 88.86% 1.419ms 1.419ms 1.312us 33.06% 1.312us 1.312us 1
4004
- aten::empty 1.30% 20.712us 1.30% 20.712us 6.904us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaLaunchKernel 1.56% 24.841us 1.56% 24.841us 8.280us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaDeviceSynchronize 0.32% 5.080us 0.32% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.597ms
4009
- Self CUDA time total: 3.968us
4010
 
4011
 
4012
 
@@ -4016,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.167us 1371.87% 67.167us 67.167us 1
4020
- hf_kernels_swiglu 6.20% 101.314us 99.65% 1.628ms 1.628ms 0.000us 0.00% 6.560us 6.560us 1
4021
- _activation_beeaae6::silu_and_mul 1.28% 20.850us 92.18% 1.506ms 501.997us 4.896us 100.00% 6.560us 2.187us 3
4022
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
4023
- Activity Buffer Request 89.24% 1.458ms 89.24% 1.458ms 1.458ms 1.664us 33.99% 1.664us 1.664us 1
4024
- aten::empty 1.26% 20.660us 1.26% 20.660us 6.887us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaLaunchKernel 1.67% 27.252us 1.67% 27.252us 9.084us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 0.35% 5.710us 0.35% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 1.634ms
4029
- Self CUDA time total: 4.896us
4030
 
4031
 
4032
 
@@ -4036,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.055us 1610.42% 69.055us 69.055us 1
4040
- hf_kernels_swiglu 5.98% 106.323us 99.73% 1.773ms 1.773ms 0.000us 0.00% 5.728us 5.728us 1
4041
- _activation_beeaae6::silu_and_mul 1.23% 21.902us 92.63% 1.646ms 548.829us 4.288us 100.00% 5.728us 1.909us 3
4042
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
4043
- Activity Buffer Request 80.11% 1.424ms 80.11% 1.424ms 1.424ms 1.440us 33.58% 1.440us 1.440us 1
4044
- aten::empty 1.11% 19.750us 1.11% 19.750us 6.583us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaLaunchKernel 11.30% 200.767us 11.30% 200.767us 66.922us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaDeviceSynchronize 0.27% 4.870us 0.27% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.777ms
4049
- Self CUDA time total: 4.288us
4050
 
4051
 
4052
 
@@ -4056,17 +4056,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.438us 1043.62% 61.438us 61.438us 1
4060
- hf_kernels_swiglu 19.33% 85.364us 98.97% 437.156us 437.156us 0.000us 0.00% 7.871us 7.871us 1
4061
- _activation_beeaae6::silu_and_mul 4.88% 21.551us 75.28% 332.532us 110.844us 5.887us 100.00% 7.871us 2.624us 3
4062
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.887us 100.00% 5.887us 1.962us 3
4063
- Activity Buffer Request 35.23% 155.635us 35.23% 155.635us 155.635us 1.984us 33.70% 1.984us 1.984us 1
4064
- aten::empty 4.36% 19.260us 4.36% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaLaunchKernel 35.17% 155.346us 35.17% 155.346us 51.782us 0.000us 0.00% 0.000us 0.000us 3
4066
- cudaDeviceSynchronize 1.03% 4.560us 1.03% 4.560us 4.560us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- Self CPU time total: 441.716us
4069
- Self CUDA time total: 5.887us
4070
 
4071
 
4072
 
@@ -4076,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.160us 828.30% 64.160us 64.160us 1
4080
- hf_kernels_swiglu 7.42% 129.826us 99.74% 1.746ms 1.746ms 0.000us 0.00% 10.339us 10.339us 1
4081
- _activation_beeaae6::silu_and_mul 1.16% 20.220us 91.25% 1.597ms 532.391us 7.746us 100.00% 10.339us 3.446us 3
4082
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.746us 100.00% 7.746us 2.582us 3
4083
- Activity Buffer Request 81.29% 1.423ms 81.29% 1.423ms 1.423ms 2.593us 33.48% 2.593us 2.593us 1
4084
- aten::empty 1.08% 18.840us 1.08% 18.840us 6.280us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaLaunchKernel 8.81% 154.125us 8.81% 154.125us 51.375us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 0.26% 4.481us 0.26% 4.481us 4.481us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 1.750ms
4089
- Self CUDA time total: 7.746us
4090
 
4091
 
4092
 
@@ -4096,17 +4096,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.847us 1069.55% 70.847us 70.847us 1
4100
- hf_kernels_swiglu 6.38% 111.683us 99.73% 1.745ms 1.745ms 0.000us 0.00% 8.832us 8.832us 1
4101
- _activation_beeaae6::silu_and_mul 1.20% 21.011us 92.19% 1.613ms 537.758us 6.624us 100.00% 8.832us 2.944us 3
4102
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
4103
- Activity Buffer Request 82.19% 1.438ms 82.19% 1.438ms 1.438ms 2.208us 33.33% 2.208us 2.208us 1
4104
- aten::empty 1.16% 20.281us 1.16% 20.281us 6.760us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaLaunchKernel 8.80% 153.915us 8.80% 153.915us 51.305us 0.000us 0.00% 0.000us 0.000us 3
4106
- cudaDeviceSynchronize 0.27% 4.700us 0.27% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
- Self CPU time total: 1.750ms
4109
- Self CUDA time total: 6.624us
4110
 
4111
 
4112
 
@@ -4116,16 +4116,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.070us 668.11% 63.070us 63.070us 1
4120
- hf_kernels_swiglu 18.75% 87.072us 98.86% 459.026us 459.026us 0.000us 0.00% 12.608us 12.608us 1
4121
- _activation_beeaae6::silu_and_mul 4.59% 21.321us 76.16% 353.653us 117.884us 9.440us 100.00% 12.608us 4.203us 3
4122
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
4123
- Activity Buffer Request 38.99% 181.046us 38.99% 181.046us 181.046us 3.168us 33.56% 3.168us 3.168us 1
4124
- aten::empty 3.94% 18.301us 3.94% 18.301us 6.100us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaLaunchKernel 32.58% 151.286us 32.58% 151.286us 50.429us 0.000us 0.00% 0.000us 0.000us 3
4126
- cudaDeviceSynchronize 1.14% 5.310us 1.14% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- Self CPU time total: 464.336us
4129
  Self CUDA time total: 9.440us
4130
 
4131
 
@@ -4136,23 +4136,23 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.326us 483.85% 63.326us 63.326us 1
4140
- hf_kernels_swiglu 16.17% 100.313us 99.24% 615.771us 615.771us 0.000us 0.00% 17.472us 17.472us 1
4141
- _activation_beeaae6::silu_and_mul 3.48% 21.570us 80.17% 497.486us 165.829us 13.088us 100.00% 17.472us 5.824us 3
4142
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 100.00% 13.088us 4.363us 3
4143
- Activity Buffer Request 52.45% 325.441us 52.45% 325.441us 325.441us 4.384us 33.50% 4.384us 4.384us 1
4144
- aten::empty 2.90% 17.972us 2.90% 17.972us 5.991us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaLaunchKernel 24.25% 150.475us 24.25% 150.475us 50.158us 0.000us 0.00% 0.000us 0.000us 3
4146
- cudaDeviceSynchronize 0.76% 4.730us 0.76% 4.730us 4.730us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
- Self CPU time total: 620.501us
4149
- Self CUDA time total: 13.088us
4150
 
4151
 
4152
  impl wl p50(ms) ok
4153
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4154
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4155
- hf_kernels_swiglu cuda_T128_D768 0.02 True
4156
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4157
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4158
  hf_kernels_swiglu cuda_T256_D768 0.03 True
@@ -4163,12 +4163,60 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
- Installed 15 packages in 15ms
4167
  </div>
4168
  </div>
4169
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4170
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 12.38it/s]
4171
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 17.32it/s]</div>
4172
  <div class="cell-artifacts">
4173
  <h4>Artifacts:</h4>
4174
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.28s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:12:56 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 27C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 32.53s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 84.479us 2079.23% 84.479us 84.479us 1
3980
+ hf_kernels_swiglu 10.30% 179.633us 99.61% 1.737ms 1.737ms 0.000us 0.00% 5.471us 5.471us 1
3981
+ _activation_beeaae6::silu_and_mul 1.22% 21.351us 86.54% 1.509ms 502.938us 4.063us 100.00% 5.471us 1.824us 3
3982
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.063us 100.00% 4.063us 1.354us 3
3983
+ Activity Buffer Request 82.53% 1.439ms 82.53% 1.439ms 1.439ms 1.408us 34.65% 1.408us 1.408us 1
3984
+ aten::empty 2.76% 48.131us 2.76% 48.131us 16.044us 0.000us 0.00% 0.000us 0.000us 3
3985
+ cudaLaunchKernel 2.78% 48.541us 2.78% 48.541us 16.180us 0.000us 0.00% 0.000us 0.000us 3
3986
+ cudaDeviceSynchronize 0.39% 6.861us 0.39% 6.861us 6.861us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ Self CPU time total: 1.743ms
3989
+ Self CUDA time total: 4.063us
3990
 
3991
 
3992
 
 
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.383us 1622.96% 64.383us 64.383us 1
4000
+ hf_kernels_swiglu 5.77% 91.273us 99.69% 1.576ms 1.576ms 0.000us 0.00% 5.311us 5.311us 1
4001
+ _activation_beeaae6::silu_and_mul 1.42% 22.508us 92.74% 1.466ms 488.714us 3.967us 100.00% 5.311us 1.770us 3
4002
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
4003
+ Activity Buffer Request 89.71% 1.418ms 89.71% 1.418ms 1.418ms 1.344us 33.88% 1.344us 1.344us 1
4004
+ aten::empty 1.18% 18.580us 1.18% 18.580us 6.193us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaLaunchKernel 1.61% 25.442us 1.61% 25.442us 8.481us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaDeviceSynchronize 0.31% 4.900us 0.31% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.581ms
4009
+ Self CUDA time total: 3.967us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.375us 1326.60% 65.375us 65.375us 1
4020
+ hf_kernels_swiglu 5.63% 88.392us 99.68% 1.565ms 1.565ms 0.000us 0.00% 6.592us 6.592us 1
4021
+ _activation_beeaae6::silu_and_mul 1.42% 22.341us 92.82% 1.457ms 485.598us 4.928us 100.00% 6.592us 2.197us 3
4022
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
4023
+ Activity Buffer Request 89.75% 1.409ms 89.75% 1.409ms 1.409ms 1.664us 33.77% 1.664us 1.664us 1
4024
+ aten::empty 1.23% 19.370us 1.23% 19.370us 6.457us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaLaunchKernel 1.64% 25.701us 1.64% 25.701us 8.567us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 0.32% 5.010us 0.32% 5.010us 5.010us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 1.570ms
4029
+ Self CUDA time total: 4.928us
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.864us 1618.05% 68.864us 68.864us 1
4040
+ hf_kernels_swiglu 5.06% 90.612us 99.72% 1.787ms 1.787ms 0.000us 0.00% 5.696us 5.696us 1
4041
+ _activation_beeaae6::silu_and_mul 1.27% 22.842us 93.53% 1.676ms 558.683us 4.256us 100.00% 5.696us 1.899us 3
4042
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4043
+ Activity Buffer Request 78.82% 1.412ms 78.82% 1.412ms 1.412ms 1.440us 33.83% 1.440us 1.440us 1
4044
+ aten::empty 1.13% 20.320us 1.13% 20.320us 6.773us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaLaunchKernel 13.43% 240.735us 13.43% 240.735us 80.245us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaDeviceSynchronize 0.28% 5.081us 0.28% 5.081us 5.081us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.792ms
4049
+ Self CUDA time total: 4.256us
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.014us 1176.71% 70.014us 70.014us 1
4060
+ hf_kernels_swiglu 5.43% 92.861us 99.73% 1.704ms 1.704ms 0.000us 0.00% 7.933us 7.933us 1
4061
+ _activation_beeaae6::silu_and_mul 1.32% 22.490us 93.06% 1.590ms 530.025us 5.950us 100.00% 7.933us 2.644us 3
4062
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.950us 100.00% 5.950us 1.983us 3
4063
+ Activity Buffer Request 82.71% 1.413ms 82.71% 1.413ms 1.413ms 1.983us 33.33% 1.983us 1.983us 1
4064
+ aten::empty 1.24% 21.111us 1.24% 21.111us 7.037us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaLaunchKernel 9.03% 154.323us 9.03% 154.323us 51.441us 0.000us 0.00% 0.000us 0.000us 3
4066
+ cudaDeviceSynchronize 0.27% 4.600us 0.27% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ Self CPU time total: 1.709ms
4069
+ Self CUDA time total: 5.950us
4070
 
4071
 
4072
 
 
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 71.711us 918.31% 71.711us 71.711us 1
4080
+ hf_kernels_swiglu 20.20% 91.983us 98.97% 450.570us 450.570us 0.000us 0.00% 10.402us 10.402us 1
4081
+ _activation_beeaae6::silu_and_mul 4.90% 22.310us 74.58% 339.547us 113.182us 7.809us 100.00% 10.402us 3.467us 3
4082
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.809us 100.00% 7.809us 2.603us 3
4083
+ Activity Buffer Request 36.02% 164.004us 36.02% 164.004us 164.004us 2.593us 33.21% 2.593us 2.593us 1
4084
+ aten::empty 4.18% 19.040us 4.18% 19.040us 6.347us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaLaunchKernel 33.66% 153.233us 33.66% 153.233us 51.078us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 1.03% 4.690us 1.03% 4.690us 4.690us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 455.260us
4089
+ Self CUDA time total: 7.809us
4090
 
4091
 
4092
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.446us 968.24% 64.446us 64.446us 1
4100
+ hf_kernels_swiglu 19.89% 86.491us 98.92% 430.210us 430.210us 0.000us 0.00% 8.897us 8.897us 1
4101
+ _activation_beeaae6::silu_and_mul 5.08% 22.091us 74.70% 324.868us 108.289us 6.656us 100.00% 8.897us 2.966us 3
4102
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.656us 100.00% 6.656us 2.219us 3
4103
+ Activity Buffer Request 34.88% 151.694us 34.88% 151.694us 151.694us 2.241us 33.67% 2.241us 2.241us 1
4104
+ aten::empty 4.33% 18.851us 4.33% 18.851us 6.284us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaLaunchKernel 34.74% 151.083us 34.74% 151.083us 50.361us 0.000us 0.00% 0.000us 0.000us 3
4106
+ cudaDeviceSynchronize 1.08% 4.700us 1.08% 4.700us 4.700us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ Self CPU time total: 434.910us
4109
+ Self CUDA time total: 6.656us
4110
 
4111
 
4112
 
 
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.471us 735.92% 69.471us 69.471us 1
4120
+ hf_kernels_swiglu 5.54% 94.743us 99.69% 1.705ms 1.705ms 0.000us 0.00% 12.608us 12.608us 1
4121
+ _activation_beeaae6::silu_and_mul 1.25% 21.451us 93.03% 1.592ms 530.512us 9.440us 100.00% 12.608us 4.203us 3
4122
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
4123
+ Activity Buffer Request 82.96% 1.419ms 82.96% 1.419ms 1.419ms 3.168us 33.56% 3.168us 3.168us 1
4124
+ aten::empty 1.12% 19.220us 1.12% 19.220us 6.407us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaLaunchKernel 8.81% 150.793us 8.81% 150.793us 50.264us 0.000us 0.00% 0.000us 0.000us 3
4126
+ cudaDeviceSynchronize 0.31% 5.230us 0.31% 5.230us 5.230us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ Self CPU time total: 1.711ms
4129
  Self CUDA time total: 9.440us
4130
 
4131
 
 
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.606us 520.41% 68.606us 68.606us 1
4140
+ hf_kernels_swiglu 20.98% 86.561us 98.91% 408.129us 408.129us 0.000us 0.00% 17.599us 17.599us 1
4141
+ _activation_beeaae6::silu_and_mul 5.52% 22.769us 73.39% 302.816us 100.939us 13.183us 100.00% 17.599us 5.866us 3
4142
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.183us 100.00% 13.183us 4.394us 3
4143
+ Activity Buffer Request 29.84% 123.113us 29.84% 123.113us 123.113us 4.416us 33.50% 4.416us 4.416us 1
4144
+ aten::empty 4.54% 18.752us 4.54% 18.752us 6.251us 0.000us 0.00% 0.000us 0.000us 3
4145
+ cudaLaunchKernel 38.03% 156.934us 38.03% 156.934us 52.311us 0.000us 0.00% 0.000us 0.000us 3
4146
+ cudaDeviceSynchronize 1.09% 4.500us 1.09% 4.500us 4.500us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
+ Self CPU time total: 412.629us
4149
+ Self CUDA time total: 13.183us
4150
 
4151
 
4152
  impl wl p50(ms) ok
4153
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4154
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4155
+ hf_kernels_swiglu cuda_T128_D768 0.03 True
4156
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4157
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4158
  hf_kernels_swiglu cuda_T256_D768 0.03 True
 
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4167
+ Downloading sympy (6.0MiB)
4168
+ Downloading networkx (1.9MiB)
4169
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4170
+ Downloading numpy (16.2MiB)
4171
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4172
+ Downloading setuptools (1.1MiB)
4173
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4174
+ Downloading kiwisolver (1.4MiB)
4175
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4176
+ Downloading nvidia-curand-cu12 (60.7MiB)
4177
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4178
+ Downloading torch (846.9MiB)
4179
+ Downloading hf-xet (3.2MiB)
4180
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4181
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4182
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4183
+ Downloading fonttools (4.7MiB)
4184
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4185
+ Downloading triton (148.3MiB)
4186
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4187
+ Downloading pillow (6.7MiB)
4188
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4189
+ Downloading matplotlib (8.3MiB)
4190
+ Downloading nvidia-cufile-cu12
4191
+ Downloading kiwisolver
4192
+ Downloading hf-xet
4193
+ Downloading setuptools
4194
+ Downloading fonttools
4195
+ Downloading networkx
4196
+ Downloading pillow
4197
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4198
+ Downloading matplotlib
4199
+ Downloading nvidia-cuda-cupti-cu12
4200
+ Downloading numpy
4201
+ Downloading sympy
4202
+ Downloading nvidia-nvjitlink-cu12
4203
+ Downloading nvidia-curand-cu12
4204
+ Downloading nvidia-cuda-nvrtc-cu12
4205
+ Downloading triton
4206
+ Downloading nvidia-cufft-cu12
4207
+ Downloading nvidia-cusolver-cu12
4208
+ Downloading nvidia-cusparse-cu12
4209
+ Downloading nvidia-cusparselt-cu12
4210
+ Downloading nvidia-nccl-cu12
4211
+ Downloading nvidia-cublas-cu12
4212
+ Downloading nvidia-cudnn-cu12
4213
+ Downloading torch
4214
+ Installed 52 packages in 206ms
4215
  </div>
4216
  </div>
4217
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4218
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 12.63it/s]
4219
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 17.67it/s]</div>
4220
  <div class="cell-artifacts">
4221
  <h4>Artifacts:</h4>
4222
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
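Each row of the summary table is also serialized to the activation.jsonl artifact linked above. A small sketch of rebuilding the "impl  wl  p50(ms)  ok" table from it (field names follow this benchmark's JSONL records; the path is relative to the report):

import json

with open("artifacts/benchmark/activation.jsonl") as f:
    records = [json.loads(line) for line in f]

for r in records:
    # implementation, workload name, median latency in ms, correctness flag
    print(r["impl"], r["wl"]["name"], round(r["lat_ms"]["p50"], 2), r["ok"])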
activation/impls/torch_swiglu.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.21s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 00:36:01 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.21s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 29C P0 77W / 350W | 0MiB / 46068MiB | 0% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3918,9 +3918,9 @@ Cell: nv | 0.21s
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 6.96s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.254us 1623.18% 208.254us 208.254us 1
3974
- torch_eager 11.63% 222.938us 99.53% 1.908ms 1.908ms 0.000us 0.00% 15.165us 15.165us 1
3975
- aten::silu 3.35% 64.173us 81.27% 1.558ms 519.434us 6.558us 51.11% 8.893us 2.964us 3
3976
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.11% 6.558us 2.186us 3
3977
- aten::mul 2.01% 38.591us 3.22% 61.711us 20.570us 6.272us 48.89% 6.272us 2.091us 3
3978
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 48.89% 6.272us 2.091us 3
3979
- Activity Buffer Request 75.51% 1.448ms 75.51% 1.448ms 1.448ms 2.335us 18.20% 2.335us 2.335us 1
3980
- aten::slice 2.75% 52.771us 3.41% 65.422us 10.904us 0.000us 0.00% 0.000us 0.000us 6
3981
- aten::as_strided 0.66% 12.651us 0.66% 12.651us 2.108us 0.000us 0.00% 0.000us 0.000us 6
3982
- cudaLaunchKernel 3.62% 69.391us 3.62% 69.391us 11.565us 0.000us 0.00% 0.000us 0.000us 6
3983
- cudaDeviceSynchronize 0.47% 9.050us 0.47% 9.050us 9.050us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
- Self CPU time total: 1.917ms
3986
- Self CUDA time total: 12.830us
3987
 
3988
 
3989
 
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.071us 1219.79% 151.071us 151.071us 1
3997
- torch_eager 7.39% 126.424us 99.65% 1.704ms 1.704ms 0.000us 0.00% 14.561us 14.561us 1
3998
- aten::silu 2.37% 40.550us 87.76% 1.501ms 500.240us 6.400us 51.68% 8.576us 2.859us 3
3999
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 51.68% 6.400us 2.133us 3
4000
- aten::mul 1.49% 25.470us 2.58% 44.190us 14.730us 5.985us 48.32% 5.985us 1.995us 3
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.985us 48.32% 5.985us 1.995us 3
4002
- Activity Buffer Request 83.86% 1.434ms 83.86% 1.434ms 1.434ms 2.176us 17.57% 2.176us 2.176us 1
4003
- aten::slice 1.55% 26.493us 1.91% 32.623us 5.437us 0.000us 0.00% 0.000us 0.000us 6
4004
- aten::as_strided 0.36% 6.130us 0.36% 6.130us 1.022us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaLaunchKernel 2.63% 44.922us 2.63% 44.922us 7.487us 0.000us 0.00% 0.000us 0.000us 6
4006
- cudaDeviceSynchronize 0.35% 5.980us 0.35% 5.980us 5.980us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.710ms
4009
- Self CUDA time total: 12.385us
4010
 
4011
 
4012
 
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 154.943us 1178.09% 154.943us 154.943us 1
4020
- torch_eager 7.25% 123.104us 99.64% 1.692ms 1.692ms 0.000us 0.00% 15.424us 15.424us 1
4021
- aten::silu 2.33% 39.532us 87.79% 1.491ms 496.854us 6.784us 51.58% 9.056us 3.019us 3
4022
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.58% 6.784us 2.261us 3
4023
- aten::mul 1.58% 26.910us 2.71% 46.021us 15.340us 6.368us 48.42% 6.368us 2.123us 3
4024
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.368us 48.42% 6.368us 2.123us 3
4025
- Activity Buffer Request 83.90% 1.424ms 83.90% 1.424ms 1.424ms 2.272us 17.27% 2.272us 2.272us 1
4026
- aten::slice 1.53% 26.021us 1.89% 32.121us 5.353us 0.000us 0.00% 0.000us 0.000us 6
4027
- aten::as_strided 0.36% 6.100us 0.36% 6.100us 1.017us 0.000us 0.00% 0.000us 0.000us 6
4028
- cudaLaunchKernel 2.69% 45.642us 2.69% 45.642us 7.607us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaDeviceSynchronize 0.36% 6.080us 0.36% 6.080us 6.080us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- Self CPU time total: 1.698ms
4032
- Self CUDA time total: 13.152us
4033
 
4034
 
4035
 
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 180.768us 1415.79% 180.768us 180.768us 1
4043
- torch_eager 7.93% 123.526us 99.68% 1.554ms 1.554ms 0.000us 0.00% 14.976us 14.976us 1
4044
- aten::silu 3.24% 50.441us 85.53% 1.333ms 444.348us 6.592us 51.63% 8.800us 2.933us 3
4045
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 51.63% 6.592us 2.197us 3
4046
- aten::mul 1.75% 27.260us 4.09% 63.791us 21.264us 6.176us 48.37% 6.176us 2.059us 3
4047
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.37% 6.176us 2.059us 3
4048
- Activity Buffer Request 67.46% 1.051ms 67.46% 1.051ms 1.051ms 2.208us 17.29% 2.208us 2.208us 1
4049
- aten::slice 1.70% 26.549us 2.13% 33.261us 5.543us 0.000us 0.00% 0.000us 0.000us 6
4050
- aten::as_strided 0.43% 6.712us 0.43% 6.712us 1.119us 0.000us 0.00% 0.000us 0.000us 6
4051
- cudaLaunchKernel 17.18% 267.779us 17.18% 267.779us 44.630us 0.000us 0.00% 0.000us 0.000us 6
4052
- cudaDeviceSynchronize 0.32% 4.940us 0.32% 4.940us 4.940us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- Self CPU time total: 1.559ms
4055
- Self CUDA time total: 12.768us
4056
 
4057
 
4058
 
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.816us 1138.41% 150.816us 150.816us 1
4066
- torch_eager 6.24% 117.054us 99.74% 1.872ms 1.872ms 0.000us 0.00% 15.520us 15.520us 1
4067
- aten::silu 2.12% 39.802us 89.47% 1.679ms 559.729us 6.784us 51.21% 9.056us 3.019us 3
4068
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.21% 6.784us 2.261us 3
4069
- aten::mul 1.34% 25.111us 2.35% 44.062us 14.687us 6.464us 48.79% 6.464us 2.155us 3
4070
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.79% 6.464us 2.155us 3
4071
- Activity Buffer Request 75.90% 1.425ms 75.90% 1.425ms 1.425ms 2.272us 17.15% 2.272us 2.272us 1
4072
- aten::slice 1.36% 25.472us 1.68% 31.591us 5.265us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::as_strided 0.33% 6.119us 0.33% 6.119us 1.020us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaLaunchKernel 12.46% 233.778us 12.46% 233.778us 38.963us 0.000us 0.00% 0.000us 0.000us 6
4075
- cudaDeviceSynchronize 0.26% 4.950us 0.26% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 1.877ms
4078
- Self CUDA time total: 13.248us
4079
 
4080
 
4081
 
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.615us 923.45% 143.615us 143.615us 1
4089
- torch_eager 17.00% 110.812us 99.16% 646.262us 646.262us 0.000us 0.00% 18.240us 18.240us 1
4090
- aten::silu 6.35% 41.393us 70.99% 462.667us 154.222us 7.936us 51.03% 10.624us 3.541us 3
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
4092
- aten::mul 3.56% 23.221us 6.51% 42.412us 14.137us 7.616us 48.97% 7.616us 2.539us 3
4093
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
4094
- Activity Buffer Request 32.67% 212.907us 32.67% 212.907us 212.907us 2.688us 17.28% 2.688us 2.688us 1
4095
- aten::slice 3.77% 24.551us 4.66% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6
4096
- aten::as_strided 0.89% 5.820us 0.89% 5.820us 0.970us 0.000us 0.00% 0.000us 0.000us 6
4097
- cudaLaunchKernel 34.91% 227.558us 34.91% 227.558us 37.926us 0.000us 0.00% 0.000us 0.000us 6
4098
- cudaDeviceSynchronize 0.84% 5.490us 0.84% 5.490us 5.490us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
- Self CPU time total: 651.752us
4101
- Self CUDA time total: 15.552us
4102
 
4103
 
4104
 
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.197us 1080.16% 155.197us 155.197us 1
4112
- torch_eager 6.30% 118.195us 99.70% 1.872ms 1.872ms 0.000us 0.00% 16.864us 16.864us 1
4113
- aten::silu 2.16% 40.640us 89.31% 1.677ms 558.889us 7.360us 51.22% 9.856us 3.285us 3
4114
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
4115
- aten::mul 1.39% 26.190us 2.47% 46.331us 15.444us 7.008us 48.78% 7.008us 2.336us 3
4116
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
4117
- Activity Buffer Request 76.28% 1.432ms 76.28% 1.432ms 1.432ms 2.496us 17.37% 2.496us 2.496us 1
4118
- aten::slice 1.31% 24.671us 1.64% 30.721us 5.120us 0.000us 0.00% 0.000us 0.000us 6
4119
- aten::as_strided 0.32% 6.050us 0.32% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
4120
- cudaLaunchKernel 11.93% 224.049us 11.93% 224.049us 37.341us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaDeviceSynchronize 0.30% 5.540us 0.30% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
- Self CPU time total: 1.877ms
4124
- Self CUDA time total: 14.368us
4125
 
4126
 
4127
 
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 144.252us 927.61% 144.252us 144.252us 1
4135
- torch_eager 18.42% 116.554us 99.16% 627.471us 627.471us 0.000us 0.00% 18.239us 18.239us 1
4136
- aten::silu 6.52% 41.251us 69.31% 438.595us 146.198us 7.968us 51.24% 10.656us 3.552us 3
4137
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.24% 7.968us 2.656us 3
4138
- aten::mul 3.66% 23.182us 6.58% 41.632us 13.877us 7.583us 48.76% 7.583us 2.528us 3
4139
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.583us 48.76% 7.583us 2.528us 3
4140
- Activity Buffer Request 30.96% 195.937us 30.96% 195.937us 195.937us 2.688us 17.29% 2.688us 2.688us 1
4141
- aten::slice 3.89% 24.640us 4.85% 30.690us 5.115us 0.000us 0.00% 0.000us 0.000us 6
4142
- aten::as_strided 0.96% 6.050us 0.96% 6.050us 1.008us 0.000us 0.00% 0.000us 0.000us 6
4143
- cudaLaunchKernel 34.74% 219.857us 34.74% 219.857us 36.643us 0.000us 0.00% 0.000us 0.000us 6
4144
- cudaDeviceSynchronize 0.84% 5.310us 0.84% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
- Self CPU time total: 632.781us
4147
- Self CUDA time total: 15.551us
4148
 
4149
 
4150
 
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 150.463us 665.09% 150.463us 150.463us 1
4158
- torch_eager 5.93% 109.544us 99.69% 1.842ms 1.842ms 0.000us 0.00% 26.527us 26.527us 1
4159
- aten::silu 2.24% 41.413us 89.69% 1.657ms 552.422us 11.584us 51.20% 15.488us 5.163us 3
4160
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.584us 51.20% 11.584us 3.861us 3
4161
- aten::mul 1.32% 24.310us 2.35% 43.432us 14.477us 11.039us 48.80% 11.039us 3.680us 3
4162
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.039us 48.80% 11.039us 3.680us 3
4163
- Activity Buffer Request 76.49% 1.413ms 76.49% 1.413ms 1.413ms 3.904us 17.26% 3.904us 3.904us 1
4164
- aten::slice 1.39% 25.640us 1.72% 31.740us 5.290us 0.000us 0.00% 0.000us 0.000us 6
4165
- aten::as_strided 0.33% 6.100us 0.33% 6.100us 1.017us 0.000us 0.00% 0.000us 0.000us 6
4166
- cudaLaunchKernel 12.00% 221.728us 12.00% 221.728us 36.955us 0.000us 0.00% 0.000us 0.000us 6
4167
- cudaDeviceSynchronize 0.31% 5.690us 0.31% 5.690us 5.690us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
- Self CPU time total: 1.848ms
4170
- Self CUDA time total: 22.623us
4171
 
4172
 
4173
  impl wl p50(ms) ok
@@ -4181,12 +4181,6 @@ torch_eager cuda_T512_D1024 0.05 True
4181
  torch_eager cuda_T512_D2048 0.05 True
4182
  torch_eager cuda_T512_D768 0.05 True
4183
  </pre></div>
4184
- <div class="uv-install-logs" id="uv-logs-benchmark">
4185
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4186
- <div class="uv-logs-content" style="display: none;">
4187
- Installed 37 packages in 235ms
4188
- </div>
4189
- </div>
4190
  <div class="cell-artifacts">
4191
  <h4>Artifacts:</h4>
4192
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.28s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:12:56 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 27C P8 22W / 350W | 0MiB / 46068MiB | 0% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 3.39s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.672us 1646.72% 208.672us 208.672us 1
3974
+ torch_eager 11.52% 217.973us 99.62% 1.885ms 1.885ms 0.000us 0.00% 14.976us 14.976us 1
3975
+ aten::silu 3.07% 58.081us 81.78% 1.547ms 515.694us 6.464us 51.01% 8.768us 2.923us 3
3976
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 51.01% 6.464us 2.155us 3
3977
+ aten::mul 1.91% 36.092us 3.28% 62.082us 20.694us 6.208us 48.99% 6.208us 2.069us 3
3978
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.99% 6.208us 2.069us 3
3979
+ Activity Buffer Request 76.33% 1.444ms 76.33% 1.444ms 1.444ms 2.304us 18.18% 2.304us 2.304us 1
3980
+ aten::slice 2.46% 46.622us 3.04% 57.552us 9.592us 0.000us 0.00% 0.000us 0.000us 6
3981
+ aten::as_strided 0.58% 10.930us 0.58% 10.930us 1.822us 0.000us 0.00% 0.000us 0.000us 6
3982
+ cudaLaunchKernel 3.75% 71.021us 3.75% 71.021us 11.837us 0.000us 0.00% 0.000us 0.000us 6
3983
+ cudaDeviceSynchronize 0.38% 7.160us 0.38% 7.160us 7.160us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
+ Self CPU time total: 1.892ms
3986
+ Self CUDA time total: 12.672us
3987
 
3988
 
3989
 
 
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 189.724us 1532.13% 189.724us 189.724us 1
3997
+ torch_eager 7.75% 136.545us 99.70% 1.756ms 1.756ms 0.000us 0.00% 14.559us 14.559us 1
3998
+ aten::silu 2.47% 43.560us 85.85% 1.512ms 503.984us 6.399us 51.68% 8.575us 2.858us 3
3999
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4000
+ aten::mul 2.87% 50.460us 4.18% 73.560us 24.520us 5.984us 48.32% 5.984us 1.995us 3
4001
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
4002
+ Activity Buffer Request 81.76% 1.440ms 81.76% 1.440ms 1.440ms 2.176us 17.57% 2.176us 2.176us 1
4003
+ aten::slice 1.56% 27.471us 1.92% 33.791us 5.632us 0.000us 0.00% 0.000us 0.000us 6
4004
+ aten::as_strided 0.36% 6.320us 0.36% 6.320us 1.053us 0.000us 0.00% 0.000us 0.000us 6
4005
+ cudaLaunchKernel 2.93% 51.591us 2.93% 51.591us 8.598us 0.000us 0.00% 0.000us 0.000us 6
4006
+ cudaDeviceSynchronize 0.30% 5.280us 0.30% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.761ms
4009
+ Self CUDA time total: 12.383us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.261us 1202.24% 159.261us 159.261us 1
4020
+ torch_eager 7.59% 133.144us 99.70% 1.749ms 1.749ms 0.000us 0.00% 15.487us 15.487us 1
4021
+ aten::silu 2.45% 42.980us 87.40% 1.533ms 511.158us 6.783us 51.20% 9.023us 3.008us 3
4022
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 51.20% 6.783us 2.261us 3
4023
+ aten::mul 1.60% 28.151us 2.82% 49.551us 16.517us 6.464us 48.80% 6.464us 2.155us 3
4024
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.80% 6.464us 2.155us 3
4025
+ Activity Buffer Request 83.38% 1.463ms 83.38% 1.463ms 1.463ms 2.240us 16.91% 2.240us 2.240us 1
4026
+ aten::slice 1.54% 26.990us 1.89% 33.190us 5.532us 0.000us 0.00% 0.000us 0.000us 6
4027
+ aten::as_strided 0.35% 6.200us 0.35% 6.200us 1.033us 0.000us 0.00% 0.000us 0.000us 6
4028
+ cudaLaunchKernel 2.79% 48.992us 2.79% 48.992us 8.165us 0.000us 0.00% 0.000us 0.000us 6
4029
+ cudaDeviceSynchronize 0.30% 5.190us 0.30% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ Self CPU time total: 1.755ms
4032
+ Self CUDA time total: 13.247us
4033
 
4034
 
4035
 
 
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.719us 1246.22% 158.719us 158.719us 1
4043
+ torch_eager 6.58% 125.161us 99.76% 1.897ms 1.897ms 0.000us 0.00% 14.944us 14.944us 1
4044
+ aten::silu 2.27% 43.111us 89.01% 1.692ms 564.032us 6.560us 51.51% 8.768us 2.923us 3
4045
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
4046
+ aten::mul 1.36% 25.870us 2.47% 46.950us 15.650us 6.176us 48.49% 6.176us 2.059us 3
4047
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
4048
+ Activity Buffer Request 75.60% 1.437ms 75.60% 1.437ms 1.437ms 2.208us 17.34% 2.208us 2.208us 1
4049
+ aten::slice 1.39% 26.382us 1.70% 32.293us 5.382us 0.000us 0.00% 0.000us 0.000us 6
4050
+ aten::as_strided 0.31% 5.911us 0.31% 5.911us 0.985us 0.000us 0.00% 0.000us 0.000us 6
4051
+ cudaLaunchKernel 12.25% 232.925us 12.25% 232.925us 38.821us 0.000us 0.00% 0.000us 0.000us 6
4052
+ cudaDeviceSynchronize 0.24% 4.510us 0.24% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ Self CPU time total: 1.901ms
4055
+ Self CUDA time total: 12.736us
4056
 
4057
 
4058
 
 
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.887us 1158.61% 153.887us 153.887us 1
4066
+ torch_eager 6.96% 128.034us 99.73% 1.834ms 1.834ms 0.000us 0.00% 15.586us 15.586us 1
4067
+ aten::silu 2.31% 42.562us 88.63% 1.630ms 543.305us 6.849us 51.57% 9.153us 3.051us 3
4068
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.849us 51.57% 6.849us 2.283us 3
4069
+ aten::mul 1.46% 26.931us 2.44% 44.851us 14.950us 6.433us 48.43% 6.433us 2.144us 3
4070
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.43% 6.433us 2.144us 3
4071
+ Activity Buffer Request 77.32% 1.422ms 77.32% 1.422ms 1.422ms 2.304us 17.35% 2.304us 2.304us 1
4072
+ aten::slice 1.36% 24.939us 1.70% 31.240us 5.207us 0.000us 0.00% 0.000us 0.000us 6
4073
+ aten::as_strided 0.34% 6.301us 0.34% 6.301us 1.050us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaLaunchKernel 9.97% 183.363us 9.97% 183.363us 30.561us 0.000us 0.00% 0.000us 0.000us 6
4075
+ cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 1.839ms
4078
+ Self CUDA time total: 13.282us
4079
 
4080
 
4081
 
 
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.278us 1009.23% 157.278us 157.278us 1
4089
+ torch_eager 8.12% 150.915us 99.71% 1.854ms 1.854ms 0.000us 0.00% 18.272us 18.272us 1
4090
+ aten::silu 2.38% 44.260us 87.35% 1.624ms 541.305us 8.000us 51.33% 10.688us 3.563us 3
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 51.33% 8.000us 2.667us 3
4092
+ aten::mul 1.41% 26.151us 2.51% 46.701us 15.567us 7.584us 48.67% 7.584us 2.528us 3
4093
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.67% 7.584us 2.528us 3
4094
+ Activity Buffer Request 76.39% 1.420ms 76.39% 1.420ms 1.420ms 2.688us 17.25% 2.688us 2.688us 1
4095
+ aten::slice 1.39% 25.840us 1.73% 32.160us 5.360us 0.000us 0.00% 0.000us 0.000us 6
4096
+ aten::as_strided 0.34% 6.320us 0.34% 6.320us 1.053us 0.000us 0.00% 0.000us 0.000us 6
4097
+ cudaLaunchKernel 9.68% 179.994us 9.68% 179.994us 29.999us 0.000us 0.00% 0.000us 0.000us 6
4098
+ cudaDeviceSynchronize 0.29% 5.351us 0.29% 5.351us 5.351us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
+ Self CPU time total: 1.859ms
4101
+ Self CUDA time total: 15.584us
4102
 
4103
 
4104
 
 
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 161.728us 1130.65% 161.728us 161.728us 1
4112
+ torch_eager 7.31% 130.302us 99.73% 1.777ms 1.777ms 0.000us 0.00% 16.768us 16.768us 1
4113
+ aten::silu 2.39% 42.651us 87.87% 1.566ms 521.901us 7.328us 51.23% 9.792us 3.264us 3
4114
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
4115
+ aten::mul 1.55% 27.651us 2.68% 47.751us 15.917us 6.976us 48.77% 6.976us 2.325us 3
4116
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
4117
+ Activity Buffer Request 76.61% 1.365ms 76.61% 1.365ms 1.365ms 2.464us 17.23% 2.464us 2.464us 1
4118
+ aten::slice 1.50% 26.642us 1.87% 33.262us 5.544us 0.000us 0.00% 0.000us 0.000us 6
4119
+ aten::as_strided 0.37% 6.620us 0.37% 6.620us 1.103us 0.000us 0.00% 0.000us 0.000us 6
4120
+ cudaLaunchKernel 9.99% 177.974us 9.99% 177.974us 29.662us 0.000us 0.00% 0.000us 0.000us 6
4121
+ cudaDeviceSynchronize 0.27% 4.870us 0.27% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
+ Self CPU time total: 1.782ms
4124
+ Self CUDA time total: 14.304us
4125
 
4126
 
4127
 
 
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.416us 1005.63% 156.416us 156.416us 1
4135
+ torch_eager 7.17% 130.703us 99.74% 1.819ms 1.819ms 0.000us 0.00% 18.243us 18.243us 1
4136
+ aten::silu 2.30% 42.032us 88.31% 1.611ms 536.959us 7.970us 51.24% 10.659us 3.553us 3
4137
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.970us 51.24% 7.970us 2.657us 3
4138
+ aten::mul 1.41% 25.800us 2.54% 46.410us 15.470us 7.584us 48.76% 7.584us 2.528us 3
4139
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
4140
+ Activity Buffer Request 77.37% 1.411ms 77.37% 1.411ms 1.411ms 2.689us 17.29% 2.689us 2.689us 1
4141
+ aten::slice 1.41% 25.640us 1.72% 31.370us 5.228us 0.000us 0.00% 0.000us 0.000us 6
4142
+ aten::as_strided 0.31% 5.730us 0.31% 5.730us 0.955us 0.000us 0.00% 0.000us 0.000us 6
4143
+ cudaLaunchKernel 9.77% 178.145us 9.77% 178.145us 29.691us 0.000us 0.00% 0.000us 0.000us 6
4144
+ cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
+ Self CPU time total: 1.824ms
4147
+ Self CUDA time total: 15.554us
4148
 
4149
 
4150
 
 
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.390us 709.54% 159.390us 159.390us 1
4158
+ torch_eager 6.97% 127.342us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.336us 26.336us 1
4159
+ aten::silu 2.35% 42.870us 88.50% 1.617ms 539.138us 11.520us 51.28% 15.392us 5.131us 3
4160
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 51.28% 11.520us 3.840us 3
4161
+ aten::mul 1.55% 28.251us 2.57% 47.051us 15.684us 10.944us 48.72% 10.944us 3.648us 3
4162
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.72% 10.944us 3.648us 3
4163
+ Activity Buffer Request 77.70% 1.420ms 77.70% 1.420ms 1.420ms 3.872us 17.24% 3.872us 3.872us 1
4164
+ aten::slice 1.38% 25.151us 1.70% 31.112us 5.185us 0.000us 0.00% 0.000us 0.000us 6
4165
+ aten::as_strided 0.33% 5.961us 0.33% 5.961us 0.993us 0.000us 0.00% 0.000us 0.000us 6
4166
+ cudaLaunchKernel 9.48% 173.263us 9.48% 173.263us 28.877us 0.000us 0.00% 0.000us 0.000us 6
4167
+ cudaDeviceSynchronize 0.26% 4.721us 0.26% 4.721us 4.721us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
+ Self CPU time total: 1.828ms
4170
+ Self CUDA time total: 22.464us
4171
 
4172
 
4173
  impl wl p50(ms) ok
 
4181
  torch_eager cuda_T512_D2048 0.05 True
4182
  torch_eager cuda_T512_D768 0.05 True
4183
  </pre></div>
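The torch_eager numbers above come from the unfused reference: two strided views of the packed input, a SiLU, then a pointwise multiply, which is exactly the aten::slice / aten::silu / aten::mul sequence in each trace. A minimal sketch of that baseline (the benchmarked version lives in cells/benchmark.py):

import torch
import torch.nn.functional as F

def swiglu_eager(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]  # aten::slice (+ aten::as_strided)
    return F.silu(gate) * up          # aten::silu followed by aten::mul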
 
4184
  <div class="cell-artifacts">
4185
  <h4>Artifacts:</h4>
4186
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/results/artifacts/combine/latency.svg CHANGED
activation/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-29T00:37:20.527749</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4021,83 +4021,83 @@
@@ -4105,37 +4105,37 @@
[latency plot SVG markup elided: y-axis gridlines with tick labels 0.025-0.050 (latency, ms) and the nine-point p50 polylines for the hf-kernels-swiglu and torch-eager series, redrawn at updated coordinates]
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4193
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4194
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4195
  </span> |
4196
- Cell: combine | 4.29s
4197
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4198
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4199
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4284,7 +4284,7 @@ COMBINED BENCHMARK SUMMARY
4284
  impl wl p50(ms) ok
4285
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4286
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4287
- hf_kernels_swiglu cuda_T128_D768 0.02 True
4288
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4289
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4290
  hf_kernels_swiglu cuda_T256_D768 0.03 True
@@ -4319,7 +4319,7 @@ Implementations included:
4319
  <div class="uv-install-logs" id="uv-logs-combine">
4320
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4321
  <div class="uv-logs-content" style="display: none;">
4322
- Installed 37 packages in 208ms
4323
  </div>
4324
  </div>
4325
  <div class="cell-artifacts">
@@ -4332,7 +4332,7 @@ Installed 37 packages in 208ms
4332
  <rdf:RDF>
4333
  <ns2:Work>
4334
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4335
- <dc:date>2025-10-29T00:37:20.527749</dc:date>
4336
  <dc:format>image/svg+xml</dc:format>
4337
  <dc:creator>
4338
  <ns2:Agent>
@@ -4481,83 +4481,83 @@ Installed 37 packages in 208ms
[Removed SVG plot body (this hunk and the @@ -4565,37 +4565,37 @@ hunk that followed): y-axis ticks 0.025 to 0.050, series "hf-kernels-swiglu" and "torch-eager"; gridline, tick-label, and marker-path markup omitted.]
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T04:14:49.758878</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
4021
  <g id="matplotlib.axis_2">
4022
  <g id="ytick_1">
4023
  <g id="grid-y--2" class="grid grid-y">
4024
+ <path d="M 60.23 454.34229 L 847.294169 454.34229 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4025
  </g>
4026
  <g id="line2d_10">
4027
  <defs>
4028
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4029
  </defs>
4030
  <g>
4031
+ <use ns4:href="#m0fca2865ba" x="60.23" y="454.34229" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="458.141509" transform="rotate(-0 53.23 458.141509)">0.025</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_2">
4039
  <g id="grid-y--3" class="grid grid-y">
4040
+ <path d="M 60.23 378.758958 L 847.294169 378.758958 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
+ <use ns4:href="#m0fca2865ba" x="60.23" y="378.758958" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="382.558177" transform="rotate(-0 53.23 382.558177)">0.030</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_3">
4052
  <g id="grid-y--4" class="grid grid-y">
4053
+ <path d="M 60.23 303.175626 L 847.294169 303.175626 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
+ <use ns4:href="#m0fca2865ba" x="60.23" y="303.175626" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="306.974844" transform="rotate(-0 53.23 306.974844)">0.035</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_4">
4065
  <g id="grid-y--5" class="grid grid-y">
4066
+ <path d="M 60.23 227.592294 L 847.294169 227.592294 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
+ <use ns4:href="#m0fca2865ba" x="60.23" y="227.592294" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="231.391512" transform="rotate(-0 53.23 231.391512)">0.040</text>
4075
  </g>
4076
  </g>
4077
  <g id="ytick_5">
4078
  <g id="grid-y--6" class="grid grid-y">
4079
+ <path d="M 60.23 152.008962 L 847.294169 152.008962 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4080
  </g>
4081
  <g id="line2d_14">
4082
  <g>
4083
+ <use ns4:href="#m0fca2865ba" x="60.23" y="152.008962" style="stroke: #000000; stroke-width: 0.8" />
4084
  </g>
4085
  </g>
4086
  <g id="text_14">
4087
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="155.80818" transform="rotate(-0 53.23 155.80818)">0.045</text>
4088
  </g>
4089
  </g>
4090
  <g id="ytick_6">
4091
  <g id="grid-y--7" class="grid grid-y">
4092
+ <path d="M 60.23 76.42563 L 847.294169 76.42563 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4093
  </g>
4094
  <g id="line2d_15">
4095
  <g>
4096
+ <use ns4:href="#m0fca2865ba" x="60.23" y="76.42563" style="stroke: #000000; stroke-width: 0.8" />
4097
  </g>
4098
  </g>
4099
  <g id="text_15">
4100
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="80.224848" transform="rotate(-0 53.23 80.224848)">0.050</text>
4101
  </g>
4102
  </g>
4103
  <g id="label--y" class="ylabel">
 
4105
  </g>
4106
  </g>
4107
  <g id="series--hf-kernels-swiglu" class="series">
4108
+ <path d="M 96.005644 451.16779 L 185.444754 350.778008 L 274.883864 379.197341 L 364.322974 380.406674 L 453.762084 374.828625 L 543.201194 374.360008 L 632.640304 380.406674 L 722.079415 389.174341 L 811.518525 390.081341 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4109
  <defs>
4110
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4111
  </defs>
4112
  <g clip-path="url(#p620c7d392f)">
4113
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4114
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="350.778008" style="fill: #1f77b4; stroke: #1f77b4" />
4115
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="379.197341" style="fill: #1f77b4; stroke: #1f77b4" />
4116
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="380.406674" style="fill: #1f77b4; stroke: #1f77b4" />
4117
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="374.828625" style="fill: #1f77b4; stroke: #1f77b4" />
4118
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="374.360008" style="fill: #1f77b4; stroke: #1f77b4" />
4119
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="380.406674" style="fill: #1f77b4; stroke: #1f77b4" />
4120
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="389.174341" style="fill: #1f77b4; stroke: #1f77b4" />
4121
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="390.081341" style="fill: #1f77b4; stroke: #1f77b4" />
4122
  </g>
4123
  </g>
4124
  <g id="series--torch-eager" class="series">
4125
+ <path d="M 96.005644 226.821344 L 185.444754 49.351681 L 274.883864 47.08418 L 364.322974 50.863347 L 453.762084 60.522897 L 543.201194 66.433513 L 632.640304 64.755563 L 722.079415 82.895563 L 811.518525 77.619847 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="226.821344" style="fill: #ff7f0e; stroke: #ff7f0e" />
4131
+ <use ns4:href="#m9b8c54d372" x="185.444754" y="49.351681" style="fill: #ff7f0e; stroke: #ff7f0e" />
4132
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4133
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="50.863347" style="fill: #ff7f0e; stroke: #ff7f0e" />
4134
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="60.522897" style="fill: #ff7f0e; stroke: #ff7f0e" />
4135
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="66.433513" style="fill: #ff7f0e; stroke: #ff7f0e" />
4136
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="64.755563" style="fill: #ff7f0e; stroke: #ff7f0e" />
4137
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="82.895563" style="fill: #ff7f0e; stroke: #ff7f0e" />
4138
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="77.619847" style="fill: #ff7f0e; stroke: #ff7f0e" />
4139
  </g>
4140
  </g>
4141
  <g id="patch_3">
 
4193
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4194
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4195
  </span> |
4196
+ Cell: combine | 4.31s
4197
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4198
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4199
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4284
  impl wl p50(ms) ok
4285
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4286
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4287
+ hf_kernels_swiglu cuda_T128_D768 0.03 True
4288
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4289
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4290
  hf_kernels_swiglu cuda_T256_D768 0.03 True
 
4319
  <div class="uv-install-logs" id="uv-logs-combine">
4320
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4321
  <div class="uv-logs-content" style="display: none;">
4322
+ Installed 37 packages in 238ms
4323
  </div>
4324
  </div>
4325
  <div class="cell-artifacts">
 
4332
  <rdf:RDF>
4333
  <ns2:Work>
4334
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4335
+ <dc:date>2025-10-29T04:14:49.758878</dc:date>
4336
  <dc:format>image/svg+xml</dc:format>
4337
  <dc:creator>
4338
  <ns2:Agent>
 
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.046111000017390325, "p50": 0.046270999973785365, "p90": 0.04740100001754399, "mean": 0.04670720001058726, "iqr": 0.001160000010713702, "raw_times": [0.047512000037386315, 0.04740100001754399, 0.04624100000683029, 0.046270999973785365, 0.046111000017390325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05871199999774035, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05225199998903918, "p50": 0.053462000039417035, "p90": 0.053592000028857, "mean": 0.05365380001194353, "iqr": 0.0002100000529026147, "raw_times": [0.053462000039417035, 0.055581000026450056, 0.053592000028857, 0.053381999975954386, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0581319999923835, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05121200001667603, "p50": 0.05470199999990655, "p90": 0.05482099999198908, "mean": 0.05431980000594194, "iqr": 0.0013289999856169743, "raw_times": [0.05121200001667603, 0.057372000014765945, 0.05470199999990655, 0.05482099999198908, 0.05349200000637211], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056541999981618574, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05210199998373355, "p50": 0.05333199999313365, "p90": 0.05396199998131124, "mean": 0.05322599998862643, "iqr": 0.0016399999935856613, "raw_times": [0.05210199998373355, 0.05333199999313365, 0.05396199998131124, 0.052321999987725576, 0.05441199999722812], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09094299997514099, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05103099999814731, "p50": 0.05309199997327596, "p90": 0.053381999975954386, "mean": 0.05291379998197954, "iqr": 0.0004199999921183917, "raw_times": [0.053381999975954386, 0.052961999983835994, 0.05103099999814731, 0.05309199997327596, 0.054101999978684034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603199997494812, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.05189199998767435, "p90": 0.05201199996918149, "mean": 0.052023999978700886, "iqr": 0.0004999999987376214, "raw_times": [0.05151199997044387, 0.05352199997332718, 0.05189199998767435, 0.05201199996918149, 0.051181999992877536], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055981999992127385, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05042200001525998, "p50": 0.052002000018092076, "p90": 0.05382199998393844, "mean": 0.05366420000427752, "iqr": 0.00333999997792489, "raw_times": [0.05048200000601355, 0.05042200001525998, 0.052002000018092076, 0.06159299999808354, 0.05382199998393844], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05433199999060889, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0522220000220841, "p50": 0.053632000003744906, "p90": 0.05870200004665094, "mean": 0.056078200009324064, "iqr": 0.005690000079994206, "raw_times": [0.0522220000220841, 0.06282300000748364, 0.053632000003744906, 0.05301199996665673, 0.05870200004665094], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.055741999972269696, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05032100000335049, "p50": 0.050921000024573004, "p90": 0.05318199998782802, "mean": 0.05303959999309882, "iqr": 0.0023800000121809717, "raw_times": [0.05080199997564705, 0.050921000024573004, 0.05032100000335049, 0.059971999974095525, 0.05318199998782802], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0550720000092042, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05211199999166638, "p50": 0.05235200001152407, "p90": 0.053132000005007285, "mean": 0.05707820000679931, "iqr": 0.0008700000080352766, "raw_times": [0.05235200001152407, 0.05226199999697201, 0.053132000005007285, 0.07553300002882679, 0.05211199999166638], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05610199997363452, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512720000074296, "p50": 0.0524320000181433, "p90": 0.05278200001157529, "mean": 0.05529400000341411, "iqr": 0.000919999990856013, "raw_times": [0.05278200001157529, 0.0524320000181433, 0.0512720000074296, 0.0681219999592031, 0.05186200002071928], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05547199998545693, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.051342000006115995, "p90": 0.05172099997707846, "mean": 0.053885599993463984, "iqr": 0.00040899999476096127, "raw_times": [0.05112100001269937, 0.06393199998910859, 0.05172099997707846, 0.0513119999823175, 0.051342000006115995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055091999968226446, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050531999988834286, "p50": 0.05176199999823439, "p90": 0.051821999988987955, "mean": 0.05163600000059887, "iqr": 0.0003099999617006688, "raw_times": [0.050531999988834286, 0.05176199999823439, 0.052551999999650434, 0.051821999988987955, 0.051512000027287286], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055182000039621926, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05124200004047452, "p50": 0.05148200000348879, "p90": 0.05251200002476253, "mean": 0.051918000008299714, "iqr": 0.0011100000278929656, "raw_times": [0.05251200002476253, 0.05295199997590316, 0.05148200000348879, 0.05140199999686956, 0.05124200004047452], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05506200000127137, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05025200005093211, "p50": 0.05105200000343757, "p90": 0.05146199998762313, "mean": 0.05136380001431462, "iqr": 0.0005399999736255268, "raw_times": [0.05146199998762313, 0.053131000015582686, 0.050922000013997604, 0.05025200005093211, 0.05105200000343757], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0684330000240152, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051181999992877536, "p50": 0.052152000023397704, "p90": 0.05241200000227764, "mean": 0.05240600000888662, "iqr": 0.00034999999343199306, "raw_times": [0.052152000023397704, 0.05422200001703459, 0.05241200000227764, 0.051181999992877536, 0.052062000008845644], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05490099999860831, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05016099999011203, "p50": 0.05225199998903918, "p90": 0.05251199996791911, "mean": 0.05182779999586273, "iqr": 0.001349999934063817, "raw_times": [0.05016099999011203, 0.053051999998388055, 0.05116200003385529, 0.05251199996791911, 0.05225199998903918], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05627199999480581, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05154100000481776, "p50": 0.0524320000181433, "p90": 0.05299099996136647, "mean": 0.05266959998380116, "iqr": 0.0006189999908201571, "raw_times": [0.05154100000481776, 0.054011999964131974, 0.05299099996136647, 0.0524320000181433, 0.05237199997054631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05572200001324745, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05098199994790775, "p50": 0.05128100002593783, "p90": 0.052071999959935056, "mean": 0.05161159999715892, "iqr": 0.0008409999168179638, "raw_times": [0.05098199994790775, 0.052071999959935056, 0.05128100002593783, 0.05123100004311709, 0.052492000008896866], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055401999986770534, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050202000011267955, "p50": 0.05295199997590316, "p90": 0.05307200001425372, "mean": 0.052619999996750266, "iqr": 0.00046000002384971594, "raw_times": [0.050202000011267955, 0.05307200001425372, 0.054261999991922494, 0.05295199997590316, 0.052611999990404], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05440200004613871, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05220100001679384, "p50": 0.052891999985149596, "p90": 0.05323199997064876, "mean": 0.05431980000594194, "iqr": 0.0007509999591093219, "raw_times": [0.05220100001679384, 0.052891999985149596, 0.05323199997064876, 0.06079300004557808, 0.052481000011539436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0552820000052634, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05108200002723606, "p50": 0.05157200001804085, "p90": 0.053041000001030625, "mean": 0.051985800007514626, "iqr": 0.0018490000002202578, "raw_times": [0.05157200001804085, 0.05108200002723606, 0.053041000001030625, 0.05119200000081037, 0.053041999990455224], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05657200000541707, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05095099999152808, "p50": 0.0515919999770631, "p90": 0.05208099997844329, "mean": 0.05173159999003474, "iqr": 0.0006789999815737247, "raw_times": [0.0515919999770631, 0.05208099997844329, 0.052632000006269664, 0.05095099999152808, 0.05140199999686956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056392000033156364, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
- {"ts": "2025-10-29T00:36:50Z", "run": "aa946ffb43294efa951ff76a7615ec34", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05110099999683371, "p50": 0.051662000032592914, "p90": 0.051741999982368725, "mean": 0.05161380000799909, "iqr": 0.00010999997357430402, "raw_times": [0.05163200000879442, 0.05110099999683371, 0.051741999982368725, 0.051662000032592914, 0.05193200001940568], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05588200002648591, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
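Each record stores derived stats (p10/p50/p90/mean/iqr) alongside its raw_times. As a hedged sketch of that reduction: floor-index percentiles reproduce the stored values for the five-rep samples here, though the harness's actual method is not shown in this diff.

# Re-derive the stats of the first re-recorded run below
# (cuda_B2_D64_S128_W2) from its raw_times.
def pct(xs, q):
    # floor-index ("lower") percentile over a sorted sample
    return xs[int(q * (len(xs) - 1))]

raw_times = [0.050600999998096086, 0.0489899999820409, 0.04837200003748876,
             0.04859200004148079, 0.047261000020171196]
xs = sorted(raw_times)
print({
    "p10": pct(xs, 0.10),                  # 0.047261...
    "p50": pct(xs, 0.50),                  # 0.048592...
    "p90": pct(xs, 0.90),                  # 0.048990...
    "mean": sum(xs) / len(xs),             # 0.048763...
    "iqr": pct(xs, 0.75) - pct(xs, 0.25),  # 0.000618...
})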
 
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047261000020171196, "p50": 0.04859200004148079, "p90": 0.0489899999820409, "mean": 0.048763200015855546, "iqr": 0.0006179999445521389, "raw_times": [0.050600999998096086, 0.0489899999820409, 0.04837200003748876, 0.04859200004148079, 0.047261000020171196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06049099999927421, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053621000006387476, "p50": 0.05462100000386272, "p90": 0.05485100001578758, "mean": 0.054479000004903355, "iqr": 0.0006300000450210064, "raw_times": [0.05462100000386272, 0.053621000006387476, 0.05485100001578758, 0.055081000027712435, 0.05422099997076657], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05994100001771585, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052632000006269664, "p50": 0.054361000024982786, "p90": 0.05462100000386272, "mean": 0.05404320000934604, "iqr": 0.0009999999974752427, "raw_times": [0.054361000024982786, 0.05498100000522754, 0.053621000006387476, 0.05462100000386272, 0.052632000006269664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058602000024166045, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05227200000490484, "p50": 0.05285200001026169, "p90": 0.053781000019625935, "mean": 0.05329160001110722, "iqr": 0.0009899999895424116, "raw_times": [0.05285200001026169, 0.05227200000490484, 0.054761999990660115, 0.053781000019625935, 0.052791000030083524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05603200003179154, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05200100002866748, "p50": 0.054180999995878665, "p90": 0.05433100000118429, "mean": 0.05350320001298314, "iqr": 0.0019299999962640868, "raw_times": [0.054602000034265075, 0.05433100000118429, 0.052401000004920206, 0.05200100002866748, 0.054180999995878665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05716200001870675, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05303100004994121, "p50": 0.05431100004216205, "p90": 0.05439099999193786, "mean": 0.053947000014886726, "iqr": 0.0011999999856016075, "raw_times": [0.05439099999193786, 0.05481099998405625, 0.05319100000633625, 0.05431100004216205, 0.05303100004994121], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05832099998315243, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05105100001401297, "p50": 0.05330099997991056, "p90": 0.05380099997864818, "mean": 0.054202999990593526, "iqr": 0.0006199999802447564, "raw_times": [0.05105100001401297, 0.05330099997991056, 0.05318099999840342, 0.0596809999819925, 0.05380099997864818], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056061000009322015, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05164100002730265, "p50": 0.052870999979859334, "p90": 0.05319100000633625, "mean": 0.05809520000639168, "iqr": 0.0004199999921183917, "raw_times": [0.05277100001421786, 0.052870999979859334, 0.05319100000633625, 0.05164100002730265, 0.08000200000424229], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05684100000280523, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050990999966415984, "p50": 0.05245100004458436, "p90": 0.05260099999304657, "mean": 0.05224100000305043, "iqr": 0.0006000000212225132, "raw_times": [0.050990999966415984, 0.05316100003938118, 0.05200099997182406, 0.05260099999304657, 0.05245100004458436], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05579200001193385, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05112100001269937, "p50": 0.0527509999983522, "p90": 0.053382000032797805, "mean": 0.05273720000786852, "iqr": 0.0022010000293448684, "raw_times": [0.05112100001269937, 0.0527509999983522, 0.055250999992040306, 0.053382000032797805, 0.05118100000345294], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05559099997753947, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051831000007496186, "p50": 0.05281099998910577, "p90": 0.0528209999970386, "mean": 0.05249119999461982, "iqr": 0.0007599999776175537, "raw_times": [0.0528209999970386, 0.052061000019421044, 0.051831000007496186, 0.05281099998910577, 0.0529319999600375], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05652100003317173, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05226100000754741, "p50": 0.053051999998388055, "p90": 0.053290999971977726, "mean": 0.05317119998835551, "iqr": 0.00048099997229655855, "raw_times": [0.05226100000754741, 0.054441999964183196, 0.053051999998388055, 0.05280999999968117, 0.053290999971977726], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05758100002140054, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05227100001548024, "p50": 0.05277099995737444, "p90": 0.05359099998258898, "mean": 0.053112999989934906, "iqr": 0.0010999999631167157, "raw_times": [0.05227100001548024, 0.0544409999747586, 0.05359099998258898, 0.05249100001947227, 0.05277099995737444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05578100001457642, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051241999983631104, "p50": 0.052721000031397125, "p90": 0.05347100000108185, "mean": 0.05272120000654468, "iqr": 0.0009499999578110874, "raw_times": [0.05252100004327076, 0.052721000031397125, 0.05347100000108185, 0.05365099997334255, 0.051241999983631104], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07047099995816097, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0512010000193186, "p50": 0.052960999994411395, "p90": 0.05432099999325146, "mean": 0.05357720000347399, "iqr": 0.0024799999778224446, "raw_times": [0.0512010000193186, 0.05756199999495948, 0.05184100001542902, 0.05432099999325146, 0.052960999994411395], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05626099999744838, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05130099998496007, "p50": 0.05301099997723213, "p90": 0.05393200001435616, "mean": 0.05291720000286659, "iqr": 0.0015410000173687877, "raw_times": [0.053951000040797226, 0.05393200001435616, 0.05130099998496007, 0.05301099997723213, 0.052390999996987375], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056620999998813204, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052390999996987375, "p50": 0.052880999987792165, "p90": 0.05298999997194187, "mean": 0.05283679998910884, "iqr": 0.00045899997758169775, "raw_times": [0.05253099999436017, 0.05339099999446262, 0.052390999996987375, 0.052880999987792165, 0.05298999997194187], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057541000046512636, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05287200002612735, "p50": 0.05326199999444725, "p90": 0.05488099998274265, "mean": 0.05554340000344382, "iqr": 0.0019099999803984247, "raw_times": [0.05287200002612735, 0.05326199999444725, 0.052971000002344226, 0.06373100001155763, 0.05488099998274265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05706100000679726, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05237200002738973, "p50": 0.05340200004866347, "p90": 0.054441999964183196, "mean": 0.05358960000876323, "iqr": 0.0017409999486517336, "raw_times": [0.05237200002738973, 0.05503099998804828, 0.054441999964183196, 0.05340200004866347, 0.05270100001553146], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056061000009322015, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05089100000077451, "p50": 0.054600999987997056, "p90": 0.054670999986683455, "mean": 0.05481719998670087, "iqr": 0.001720000000204891, "raw_times": [0.05089100000077451, 0.054670999986683455, 0.054600999987997056, 0.06097199997157077, 0.052950999986478564], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05628200000273864, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052641000024777895, "p50": 0.05274100004726279, "p90": 0.05537099997354744, "mean": 0.05482900000970403, "iqr": 0.002670000014859397, "raw_times": [0.05274100004726279, 0.052700999958688044, 0.05537099997354744, 0.06069100004424399, 0.052641000024777895], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056130999951164995, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05228100002341307, "p50": 0.05332099999577622, "p90": 0.053600999990521814, "mean": 0.05359900000030393, "iqr": 0.0007799999934832158, "raw_times": [0.05228100002341307, 0.05332099999577622, 0.0528209999970386, 0.055970999994769954, 0.053600999990521814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05611099999214275, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0509510000483715, "p50": 0.05250099997056168, "p90": 0.05270100001553146, "mean": 0.052225199999611505, "iqr": 0.000470000031782547, "raw_times": [0.05270100001553146, 0.0509510000483715, 0.052230999983748916, 0.05250099997056168, 0.05274199997984397], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05471000002899018, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
+ {"ts": "2025-10-29T04:14:45Z", "run": "83e7b9ef6882434ca7574bcbc698d6fc", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05066099998884965, "p50": 0.05315100003144835, "p90": 0.05323099998122416, "mean": 0.052780999999413325, "iqr": 0.0009899999895424116, "raw_times": [0.05224099999168175, 0.05462100000386272, 0.05323099998122416, 0.05315100003144835, 0.05066099998884965], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05779099996061632, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
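Neither implementation page renders inline, so as a rough guide to what these cells time: causal conv1d is a depthwise 1-D convolution over (batch, dim, seqlen) inputs in which output position t may only read inputs at positions <= t, enforced by left-padding the sequence by width - 1. A minimal torch-eager sketch under that assumption (the actual benchmarked cells may add bias or an activation):

import torch
import torch.nn.functional as F

def causal_conv1d_ref(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # x: (batch, dim, seqlen); weight: (dim, width) depthwise filter.
    dim, width = weight.shape
    # groups=dim makes the conv depthwise; padding=width-1 pads both ends,
    # so trim the right overhang back to seqlen to keep only causal outputs.
    out = F.conv1d(x, weight.unsqueeze(1), padding=width - 1, groups=dim)
    return out[..., : x.shape[-1]]

# One of the benchmarked workloads above: cuda_B2_D64_S2048_W4.
x = torch.randn(2, 64, 2048, dtype=torch.bfloat16, device="cuda")
w = torch.randn(64, 4, dtype=torch.bfloat16, device="cuda")
print(causal_conv1d_ref(x, w).shape)  # torch.Size([2, 64, 2048])
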
causal_conv1d/results/artifacts/combine/latency.svg CHANGED
causal_conv1d/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-29T00:37:16.145885</dc:date>
+ <dc:date>2025-10-29T04:15:07.150955</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4216,70 +4216,70 @@
 [SVG plot residue: the five y-axis gridlines, tick marks, and tick labels (0.1-0.5) were repositioned for the re-run's data range (e.g. the 0.1 tick moved from y=375.520266 to y=377.128985); the surrounding axis markup is unchanged]
@@ -4287,66 +4287,66 @@
 [SVG plot residue: the latency polylines and circle markers for series "hf-kernels-causal-conv1d" (blue, #1f77b4) and "torch-eager" (orange, #ff7f0e) were redrawn at the re-run's coordinates (e.g. the first torch-eager point moved from y=397.997804 to y=400.311295)]
@@ -4405,7 +4405,7 @@
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
- Cell: combine | 4.43s
+ Cell: combine | 4.37s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4516,30 +4516,30 @@ hf_kernels_causal_conv1d cuda_B4_D64_S2048_W2 0.05 True
 hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
 hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
- torch_eager cuda_B2_D2048_S128_W2 0.08 True
+ torch_eager cuda_B2_D2048_S128_W2 0.09 True
 torch_eager cuda_B2_D2048_S128_W4 0.09 True
- torch_eager cuda_B2_D2048_S2048_W2 0.15 True
+ torch_eager cuda_B2_D2048_S2048_W2 0.14 True
 torch_eager cuda_B2_D2048_S2048_W4 0.16 True
- torch_eager cuda_B2_D2048_S512_W2 0.08 True
+ torch_eager cuda_B2_D2048_S512_W2 0.09 True
- torch_eager cuda_B2_D2048_S512_W4 0.08 True
+ torch_eager cuda_B2_D2048_S512_W4 0.09 True
 torch_eager cuda_B2_D64_S128_W2 0.07 True
 torch_eager cuda_B2_D64_S128_W4 0.09 True
 torch_eager cuda_B2_D64_S2048_W2 0.09 True
- torch_eager cuda_B2_D64_S2048_W4 0.08 True
+ torch_eager cuda_B2_D64_S2048_W4 0.09 True
 torch_eager cuda_B2_D64_S512_W2 0.09 True
 torch_eager cuda_B2_D64_S512_W4 0.09 True
 torch_eager cuda_B4_D2048_S128_W2 0.09 True
- torch_eager cuda_B4_D2048_S128_W4 0.08 True
+ torch_eager cuda_B4_D2048_S128_W4 0.09 True
 torch_eager cuda_B4_D2048_S2048_W2 0.49 True
 torch_eager cuda_B4_D2048_S2048_W4 0.50 True
- torch_eager cuda_B4_D2048_S512_W2 0.09 True
+ torch_eager cuda_B4_D2048_S512_W2 0.10 True
 torch_eager cuda_B4_D2048_S512_W4 0.10 True
- torch_eager cuda_B4_D64_S128_W2 0.08 True
+ torch_eager cuda_B4_D64_S128_W2 0.09 True
- torch_eager cuda_B4_D64_S128_W4 0.08 True
+ torch_eager cuda_B4_D64_S128_W4 0.09 True
- torch_eager cuda_B4_D64_S2048_W2 0.08 True
+ torch_eager cuda_B4_D64_S2048_W2 0.09 True
 torch_eager cuda_B4_D64_S2048_W4 0.09 True
- torch_eager cuda_B4_D64_S512_W2 0.08 True
+ torch_eager cuda_B4_D64_S512_W2 0.09 True
- torch_eager cuda_B4_D64_S512_W4 0.08 True
+ torch_eager cuda_B4_D64_S512_W4 0.09 True

 GENERATING COMBINED VISUALIZATION
 
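The p50 columns above make the gap easy to quantify. A small, hypothetical post-processing sketch with a few values hard-coded from the updated table (the hf_kernels p50 for cuda_B4_D2048_S2048_W4 rounds to 0.05 ms per its JSONL record above):

p50_ms = {
    ("hf_kernels_causal_conv1d", "cuda_B4_D64_S512_W2"): 0.05,
    ("torch_eager", "cuda_B4_D64_S512_W2"): 0.09,
    ("hf_kernels_causal_conv1d", "cuda_B4_D2048_S2048_W4"): 0.05,
    ("torch_eager", "cuda_B4_D2048_S2048_W4"): 0.50,
}
for (impl, wl), t in sorted(p50_ms.items()):
    if impl == "torch_eager":
        ref = p50_ms[("hf_kernels_causal_conv1d", wl)]
        print(f"{wl}: torch_eager is {t / ref:.0f}x the hf_kernels p50")
# cuda_B4_D2048_S2048_W4: torch_eager is 10x the hf_kernels p50
# cuda_B4_D64_S512_W2: torch_eager is 2x the hf_kernels p50
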
@@ -4559,7 +4559,7 @@ Implementations included:
 <div class="uv-install-logs" id="uv-logs-combine">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 239ms
+ Installed 37 packages in 213ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -4572,7 +4572,7 @@
 <rdf:RDF>
 <ns2:Work>
 <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-29T00:37:16.145885</dc:date>
+ <dc:date>2025-10-29T04:15:07.150955</dc:date>
 <dc:format>image/svg+xml</dc:format>
 <dc:creator>
 <ns2:Agent>
@@ -4916,70 +4916,70 @@
 [SVG plot residue: second embedded copy of latency.svg (the combine cell artifact); same y-axis tick repositioning as above]
@@ -4987,66 +4987,66 @@
 [SVG plot residue: second embedded copy of latency.svg; same series data-point updates as above]
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-29T04:15:07.150955</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
4216
  <g id="matplotlib.axis_2">
4217
  <g id="ytick_1">
4218
  <g id="grid-y--2" class="grid grid-y">
4219
+ <path d="M 47.72 377.128985 L 831.034248 377.128985 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4220
  </g>
4221
  <g id="line2d_25">
4222
  <defs>
4223
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4224
  </defs>
4225
  <g>
4226
+ <use ns4:href="#m0fca2865ba" x="47.72" y="377.128985" style="stroke: #000000; stroke-width: 0.8" />
4227
  </g>
4228
  </g>
4229
  <g id="text_25">
4230
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.928204" transform="rotate(-0 40.72 380.928204)">0.1</text>
4231
  </g>
4232
  </g>
4233
  <g id="ytick_2">
4234
  <g id="grid-y--3" class="grid grid-y">
4235
+ <path d="M 47.72 293.371817 L 831.034248 293.371817 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4236
  </g>
4237
  <g id="line2d_26">
4238
  <g>
4239
+ <use ns4:href="#m0fca2865ba" x="47.72" y="293.371817" style="stroke: #000000; stroke-width: 0.8" />
4240
  </g>
4241
  </g>
4242
  <g id="text_26">
4243
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.171035" transform="rotate(-0 40.72 297.171035)">0.2</text>
4244
  </g>
4245
  </g>
4246
  <g id="ytick_3">
4247
  <g id="grid-y--4" class="grid grid-y">
4248
+ <path d="M 47.72 209.614648 L 831.034248 209.614648 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4249
  </g>
4250
  <g id="line2d_27">
4251
  <g>
4252
+ <use ns4:href="#m0fca2865ba" x="47.72" y="209.614648" style="stroke: #000000; stroke-width: 0.8" />
4253
  </g>
4254
  </g>
4255
  <g id="text_27">
4256
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.413867" transform="rotate(-0 40.72 213.413867)">0.3</text>
4257
  </g>
4258
  </g>
4259
  <g id="ytick_4">
4260
  <g id="grid-y--5" class="grid grid-y">
4261
+ <path d="M 47.72 125.857479 L 831.034248 125.857479 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4262
  </g>
4263
  <g id="line2d_28">
4264
  <g>
4265
+ <use ns4:href="#m0fca2865ba" x="47.72" y="125.857479" style="stroke: #000000; stroke-width: 0.8" />
4266
  </g>
4267
  </g>
4268
  <g id="text_28">
4269
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="129.656698" transform="rotate(-0 40.72 129.656698)">0.4</text>
4270
  </g>
4271
  </g>
4272
  <g id="ytick_5">
4273
  <g id="grid-y--6" class="grid grid-y">
4274
+ <path d="M 47.72 42.100311 L 831.034248 42.100311 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4275
  </g>
4276
  <g id="line2d_29">
4277
  <g>
4278
+ <use ns4:href="#m0fca2865ba" x="47.72" y="42.100311" style="stroke: #000000; stroke-width: 0.8" />
4279
  </g>
4280
  </g>
4281
  <g id="text_29">
4282
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="45.89953" transform="rotate(-0 40.72 45.89953)">0.5</text>
4283
  </g>
4284
  </g>
4285
  <g id="label--y" class="ylabel">
 
4287
  </g>
4288
  </g>
4289
  <g id="series--hf-kernels-causal-conv1d" class="series">
4290
+ <path d="M 83.325193 420.186871 L 114.286231 415.137151 L 145.247268 415.35492 L 176.208306 416.618815 L 207.169343 415.505682 L 238.130381 415.396798 L 269.091418 416.242746 L 300.052455 416.602901 L 331.013493 416.954681 L 361.97453 416.70341 L 392.935568 416.653156 L 423.896605 416.451301 L 454.857643 416.686659 L 485.81868 416.728537 L 516.779718 416.52752 L 547.740755 416.485641 L 578.701793 416.594526 L 609.66283 416.275411 L 640.623868 416.158151 L 671.584905 415.153902 L 702.545943 416.711786 L 733.50698 416.225994 L 764.468018 416.912803 L 795.429055 416.368381 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4291
  <defs>
4292
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4293
  </defs>
4294
  <g clip-path="url(#pb49fc4c8d2)">
4295
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4296
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="415.137151" style="fill: #1f77b4; stroke: #1f77b4" />
4297
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="415.35492" style="fill: #1f77b4; stroke: #1f77b4" />
4298
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="416.618815" style="fill: #1f77b4; stroke: #1f77b4" />
4299
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="415.505682" style="fill: #1f77b4; stroke: #1f77b4" />
4300
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="415.396798" style="fill: #1f77b4; stroke: #1f77b4" />
4301
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="416.242746" style="fill: #1f77b4; stroke: #1f77b4" />
4302
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="416.602901" style="fill: #1f77b4; stroke: #1f77b4" />
4303
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="416.954681" style="fill: #1f77b4; stroke: #1f77b4" />
4304
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="416.70341" style="fill: #1f77b4; stroke: #1f77b4" />
4305
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="416.653156" style="fill: #1f77b4; stroke: #1f77b4" />
4306
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="416.451301" style="fill: #1f77b4; stroke: #1f77b4" />
4307
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="416.686659" style="fill: #1f77b4; stroke: #1f77b4" />
4308
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="416.728537" style="fill: #1f77b4; stroke: #1f77b4" />
4309
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="416.52752" style="fill: #1f77b4; stroke: #1f77b4" />
4310
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="416.485641" style="fill: #1f77b4; stroke: #1f77b4" />
4311
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="416.594526" style="fill: #1f77b4; stroke: #1f77b4" />
4312
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="416.275411" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="416.158151" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="415.153902" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="416.711786" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="416.225994" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="416.912803" style="fill: #1f77b4; stroke: #1f77b4" />
4318
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="416.368381" style="fill: #1f77b4; stroke: #1f77b4" />
4319
  </g>
4320
  </g>
4321
  <g id="series--torch-eager" class="series">
4322
+ <path d="M 83.325193 400.311295 L 114.286231 385.754299 L 145.247268 385.905062 L 176.208306 386.851518 L 207.169343 386.441107 L 238.130381 387.194922 L 269.091418 387.194922 L 300.052455 387.161419 L 331.013493 387.211673 L 361.97453 387.647211 L 392.935568 339.58651 L 423.896605 324.619104 L 454.857643 388.040869 L 485.81868 388.342395 L 516.779718 387.915234 L 547.740755 387.806349 L 578.701793 387.086038 L 609.66283 387.186546 L 640.623868 387.429442 L 671.584905 388.216759 L 702.545943 379.824291 L 733.50698 375.36841 L 764.468018 53.322934 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4323
  <defs>
4324
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4325
  </defs>
4326
  <g clip-path="url(#pb49fc4c8d2)">
4327
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="400.311295" style="fill: #ff7f0e; stroke: #ff7f0e" />
4328
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="385.754299" style="fill: #ff7f0e; stroke: #ff7f0e" />
4329
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="385.905062" style="fill: #ff7f0e; stroke: #ff7f0e" />
4330
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="386.851518" style="fill: #ff7f0e; stroke: #ff7f0e" />
4331
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="386.441107" style="fill: #ff7f0e; stroke: #ff7f0e" />
4332
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="387.194922" style="fill: #ff7f0e; stroke: #ff7f0e" />
4333
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="387.194922" style="fill: #ff7f0e; stroke: #ff7f0e" />
4334
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="387.161419" style="fill: #ff7f0e; stroke: #ff7f0e" />
4335
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="387.211673" style="fill: #ff7f0e; stroke: #ff7f0e" />
4336
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="387.647211" style="fill: #ff7f0e; stroke: #ff7f0e" />
4337
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="339.58651" style="fill: #ff7f0e; stroke: #ff7f0e" />
4338
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="324.619104" style="fill: #ff7f0e; stroke: #ff7f0e" />
4339
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="388.040869" style="fill: #ff7f0e; stroke: #ff7f0e" />
4340
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="388.342395" style="fill: #ff7f0e; stroke: #ff7f0e" />
4341
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="387.915234" style="fill: #ff7f0e; stroke: #ff7f0e" />
4342
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="387.806349" style="fill: #ff7f0e; stroke: #ff7f0e" />
4343
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="387.086038" style="fill: #ff7f0e; stroke: #ff7f0e" />
4344
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="387.186546" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="387.429442" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="388.216759" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="379.824291" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.36841" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="53.322934" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
  </g>
4352
  </g>
 
4405
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4406
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4407
  </span> |
4408
+ Cell: combine | 4.37s
4409
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4410
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4411
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4516
  hf_kernels_causal_conv1d cuda_B4_D64_S2048_W4 0.05 True
4517
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W2 0.05 True
4518
  hf_kernels_causal_conv1d cuda_B4_D64_S512_W4 0.05 True
4519
+ torch_eager cuda_B2_D2048_S128_W2 0.09 True
4520
  torch_eager cuda_B2_D2048_S128_W4 0.09 True
4521
+ torch_eager cuda_B2_D2048_S2048_W2 0.14 True
4522
  torch_eager cuda_B2_D2048_S2048_W4 0.16 True
4523
+ torch_eager cuda_B2_D2048_S512_W2 0.09 True
4524
+ torch_eager cuda_B2_D2048_S512_W4 0.09 True
4525
  torch_eager cuda_B2_D64_S128_W2 0.07 True
4526
  torch_eager cuda_B2_D64_S128_W4 0.09 True
4527
  torch_eager cuda_B2_D64_S2048_W2 0.09 True
4528
+ torch_eager cuda_B2_D64_S2048_W4 0.09 True
4529
  torch_eager cuda_B2_D64_S512_W2 0.09 True
4530
  torch_eager cuda_B2_D64_S512_W4 0.09 True
4531
  torch_eager cuda_B4_D2048_S128_W2 0.09 True
4532
+ torch_eager cuda_B4_D2048_S128_W4 0.09 True
4533
  torch_eager cuda_B4_D2048_S2048_W2 0.49 True
4534
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
4535
+ torch_eager cuda_B4_D2048_S512_W2 0.10 True
4536
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
4537
+ torch_eager cuda_B4_D64_S128_W2 0.09 True
4538
+ torch_eager cuda_B4_D64_S128_W4 0.09 True
4539
+ torch_eager cuda_B4_D64_S2048_W2 0.09 True
4540
  torch_eager cuda_B4_D64_S2048_W4 0.09 True
4541
+ torch_eager cuda_B4_D64_S512_W2 0.09 True
4542
+ torch_eager cuda_B4_D64_S512_W4 0.09 True
4543
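For reference, a minimal sketch of how a summary table like the one above could be assembled from the per-impl JSONL artifacts; the glob pattern and column widths are illustrative, not the combine cell's actual code:

import json
from pathlib import Path

rows = []
for path in Path(".").glob("*/impls/artifacts/benchmark/*.jsonl"):
    for line in path.read_text().splitlines():
        rec = json.loads(line)  # one benchmark record, schema as in the JSONL diffs above
        rows.append((rec["impl"], rec["wl"]["name"], rec["lat_ms"]["p50"], rec["ok"]))

for impl, wl, p50, ok in sorted(rows):
    print(f"{impl:<28} {wl:<28} {p50:6.2f} {ok}")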
 
4544
  GENERATING COMBINED VISUALIZATION
4545
 
 
4559
  <div class="uv-install-logs" id="uv-logs-combine">
4560
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4561
  <div class="uv-logs-content" style="display: none;">
4562
+ Installed 37 packages in 213ms
4563
  </div>
4564
  </div>
4565
  <div class="cell-artifacts">
 
4572
  <rdf:RDF>
4573
  <ns2:Work>
4574
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4575
+ <dc:date>2025-10-29T04:15:07.150955</dc:date>
4576
  <dc:format>image/svg+xml</dc:format>
4577
  <dc:creator>
4578
  <ns2:Agent>
 
4916
  <g id="matplotlib.axis_2">
4917
  <g id="ytick_1">
4918
  <g id="grid-y--2" class="grid grid-y">
4919
+ <path d="M 47.72 377.128985 L 831.034248 377.128985 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4920
  </g>
4921
  <g id="line2d_25">
4922
  <defs>
4923
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4924
  </defs>
4925
  <g>
4926
+ <use ns4:href="#m0fca2865ba" x="47.72" y="377.128985" style="stroke: #000000; stroke-width: 0.8" />
4927
  </g>
4928
  </g>
4929
  <g id="text_25">
4930
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="380.928204" transform="rotate(-0 40.72 380.928204)">0.1</text>
4931
  </g>
4932
  </g>
4933
  <g id="ytick_2">
4934
  <g id="grid-y--3" class="grid grid-y">
4935
+ <path d="M 47.72 293.371817 L 831.034248 293.371817 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4936
  </g>
4937
  <g id="line2d_26">
4938
  <g>
4939
+ <use ns4:href="#m0fca2865ba" x="47.72" y="293.371817" style="stroke: #000000; stroke-width: 0.8" />
4940
  </g>
4941
  </g>
4942
  <g id="text_26">
4943
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="297.171035" transform="rotate(-0 40.72 297.171035)">0.2</text>
4944
  </g>
4945
  </g>
4946
  <g id="ytick_3">
4947
  <g id="grid-y--4" class="grid grid-y">
4948
+ <path d="M 47.72 209.614648 L 831.034248 209.614648 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4949
  </g>
4950
  <g id="line2d_27">
4951
  <g>
4952
+ <use ns4:href="#m0fca2865ba" x="47.72" y="209.614648" style="stroke: #000000; stroke-width: 0.8" />
4953
  </g>
4954
  </g>
4955
  <g id="text_27">
4956
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="213.413867" transform="rotate(-0 40.72 213.413867)">0.3</text>
4957
  </g>
4958
  </g>
4959
  <g id="ytick_4">
4960
  <g id="grid-y--5" class="grid grid-y">
4961
+ <path d="M 47.72 125.857479 L 831.034248 125.857479 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4962
  </g>
4963
  <g id="line2d_28">
4964
  <g>
4965
+ <use ns4:href="#m0fca2865ba" x="47.72" y="125.857479" style="stroke: #000000; stroke-width: 0.8" />
4966
  </g>
4967
  </g>
4968
  <g id="text_28">
4969
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="129.656698" transform="rotate(-0 40.72 129.656698)">0.4</text>
4970
  </g>
4971
  </g>
4972
  <g id="ytick_5">
4973
  <g id="grid-y--6" class="grid grid-y">
4974
+ <path d="M 47.72 42.100311 L 831.034248 42.100311 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4975
  </g>
4976
  <g id="line2d_29">
4977
  <g>
4978
+ <use ns4:href="#m0fca2865ba" x="47.72" y="42.100311" style="stroke: #000000; stroke-width: 0.8" />
4979
  </g>
4980
  </g>
4981
  <g id="text_29">
4982
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="45.89953" transform="rotate(-0 40.72 45.89953)">0.5</text>
4983
  </g>
4984
  </g>
4985
  <g id="label--y" class="ylabel">
 
4987
  </g>
4988
  </g>
4989
  <g id="series--hf-kernels-causal-conv1d" class="series">
4990
+ <path d="M 83.325193 420.186871 L 114.286231 415.137151 L 145.247268 415.35492 L 176.208306 416.618815 L 207.169343 415.505682 L 238.130381 415.396798 L 269.091418 416.242746 L 300.052455 416.602901 L 331.013493 416.954681 L 361.97453 416.70341 L 392.935568 416.653156 L 423.896605 416.451301 L 454.857643 416.686659 L 485.81868 416.728537 L 516.779718 416.52752 L 547.740755 416.485641 L 578.701793 416.594526 L 609.66283 416.275411 L 640.623868 416.158151 L 671.584905 415.153902 L 702.545943 416.711786 L 733.50698 416.225994 L 764.468018 416.912803 L 795.429055 416.368381 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4991
  <defs>
4992
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4993
  </defs>
4994
  <g clip-path="url(#pb49fc4c8d2)">
4995
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4996
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="415.137151" style="fill: #1f77b4; stroke: #1f77b4" />
4997
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="415.35492" style="fill: #1f77b4; stroke: #1f77b4" />
4998
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="416.618815" style="fill: #1f77b4; stroke: #1f77b4" />
4999
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="415.505682" style="fill: #1f77b4; stroke: #1f77b4" />
5000
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="415.396798" style="fill: #1f77b4; stroke: #1f77b4" />
5001
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="416.242746" style="fill: #1f77b4; stroke: #1f77b4" />
5002
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="416.602901" style="fill: #1f77b4; stroke: #1f77b4" />
5003
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="416.954681" style="fill: #1f77b4; stroke: #1f77b4" />
5004
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="416.70341" style="fill: #1f77b4; stroke: #1f77b4" />
5005
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="416.653156" style="fill: #1f77b4; stroke: #1f77b4" />
5006
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="416.451301" style="fill: #1f77b4; stroke: #1f77b4" />
5007
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="416.686659" style="fill: #1f77b4; stroke: #1f77b4" />
5008
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="416.728537" style="fill: #1f77b4; stroke: #1f77b4" />
5009
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="416.52752" style="fill: #1f77b4; stroke: #1f77b4" />
5010
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="416.485641" style="fill: #1f77b4; stroke: #1f77b4" />
5011
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="416.594526" style="fill: #1f77b4; stroke: #1f77b4" />
5012
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="416.275411" style="fill: #1f77b4; stroke: #1f77b4" />
5013
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="416.158151" style="fill: #1f77b4; stroke: #1f77b4" />
5014
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="415.153902" style="fill: #1f77b4; stroke: #1f77b4" />
5015
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="416.711786" style="fill: #1f77b4; stroke: #1f77b4" />
5016
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="416.225994" style="fill: #1f77b4; stroke: #1f77b4" />
5017
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="416.912803" style="fill: #1f77b4; stroke: #1f77b4" />
5018
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="416.368381" style="fill: #1f77b4; stroke: #1f77b4" />
5019
  </g>
5020
  </g>
5021
  <g id="series--torch-eager" class="series">
5022
+ <path d="M 83.325193 400.311295 L 114.286231 385.754299 L 145.247268 385.905062 L 176.208306 386.851518 L 207.169343 386.441107 L 238.130381 387.194922 L 269.091418 387.194922 L 300.052455 387.161419 L 331.013493 387.211673 L 361.97453 387.647211 L 392.935568 339.58651 L 423.896605 324.619104 L 454.857643 388.040869 L 485.81868 388.342395 L 516.779718 387.915234 L 547.740755 387.806349 L 578.701793 387.086038 L 609.66283 387.186546 L 640.623868 387.429442 L 671.584905 388.216759 L 702.545943 379.824291 L 733.50698 375.36841 L 764.468018 53.322934 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5023
  <defs>
5024
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5025
  </defs>
5026
  <g clip-path="url(#pb49fc4c8d2)">
5027
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="400.311295" style="fill: #ff7f0e; stroke: #ff7f0e" />
5028
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="385.754299" style="fill: #ff7f0e; stroke: #ff7f0e" />
5029
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="385.905062" style="fill: #ff7f0e; stroke: #ff7f0e" />
5030
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="386.851518" style="fill: #ff7f0e; stroke: #ff7f0e" />
5031
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="386.441107" style="fill: #ff7f0e; stroke: #ff7f0e" />
5032
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="387.194922" style="fill: #ff7f0e; stroke: #ff7f0e" />
5033
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="387.194922" style="fill: #ff7f0e; stroke: #ff7f0e" />
5034
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="387.161419" style="fill: #ff7f0e; stroke: #ff7f0e" />
5035
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="387.211673" style="fill: #ff7f0e; stroke: #ff7f0e" />
5036
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="387.647211" style="fill: #ff7f0e; stroke: #ff7f0e" />
5037
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="339.58651" style="fill: #ff7f0e; stroke: #ff7f0e" />
5038
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="324.619104" style="fill: #ff7f0e; stroke: #ff7f0e" />
5039
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="388.040869" style="fill: #ff7f0e; stroke: #ff7f0e" />
5040
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="388.342395" style="fill: #ff7f0e; stroke: #ff7f0e" />
5041
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="387.915234" style="fill: #ff7f0e; stroke: #ff7f0e" />
5042
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="387.806349" style="fill: #ff7f0e; stroke: #ff7f0e" />
5043
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="387.086038" style="fill: #ff7f0e; stroke: #ff7f0e" />
5044
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="387.186546" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="387.429442" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="388.216759" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="379.824291" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="375.36841" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="53.322934" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
  </g>
5052
  </g>
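The y ticks in this latency.svg diff (0.1 ms at y=377.128985 up to 0.5 ms at y=42.100311) pin down a linear latency-to-pixel mapping, so the marker coordinates above can be read back as milliseconds. A small sketch, assuming a linear axis; the recovered values match the p50 column in the combine table (e.g. torch_eager's 0.07 and 0.50 ms rows):

# Invert the linear ms -> pixel mapping implied by the y ticks (y grows downward).
Y0, V0 = 377.128985, 0.1   # tick labelled "0.1"
Y1, V1 = 42.100311, 0.5    # tick labelled "0.5"

def y_to_ms(y: float) -> float:
    return V0 + (y - Y0) * (V1 - V0) / (Y1 - Y0)

print(round(y_to_ms(400.311295), 2))  # first torch_eager marker -> 0.07 ms
print(round(y_to_ms(45.608899), 2))   # last torch_eager marker -> ~0.50 ms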
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9715130000245153, "p50": 0.9773340000265307, "p90": 0.9788430000412518, "mean": 0.976309200018477, "iqr": 0.005310000005920301, "raw_times": [0.9735330000353315, 0.9773340000265307, 0.9803229999647556, 0.9788430000412518, 0.9715130000245153], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9926440000072034, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0154749999742307, "p50": 1.0199449999959143, "p90": 1.0278160000325443, "mean": 1.0223952000046665, "iqr": 0.010921000011876458, "raw_times": [1.0278160000325443, 1.0168950000206678, 1.0318449999999757, 1.0154749999742307, 1.0199449999959143], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0225849999869752, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0612160000391668, "p50": 1.0721770000259312, "p90": 1.075397000022349, "mean": 1.0706886000093618, "iqr": 0.009251000051335723, "raw_times": [1.0612160000391668, 1.0721770000259312, 1.0661459999710132, 1.075397000022349, 1.078506999988349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0771669999485312, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.075485999990633, "p50": 1.0823069999901236, "p90": 1.084176999995634, "mean": 1.0827727999981107, "iqr": 0.0021099999685247894, "raw_times": [1.075485999990633, 1.0820670000271093, 1.0823069999901236, 1.0898269999870536, 1.084176999995634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1057869999717695, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2330920000067636, "p50": 1.237381999999343, "p90": 1.239422999958606, "mean": 1.2375224000038543, "iqr": 0.002220999931523693, "raw_times": [1.2405130000274767, 1.2372020000270822, 1.2330920000067636, 1.237381999999343, 1.239422999958606], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.22687200001792, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-29T00:37:11Z", "run": "205c3b57a6af43a99a29d290f8e9aef2", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2296720000222194, "p50": 1.230811999960224, "p90": 1.236231999996562, "mean": 1.2357499999893662, "iqr": 0.005929999986165058, "raw_times": [1.236231999996562, 1.2517319999574283, 1.230811999960224, 1.230302000010397, 1.2296720000222194], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2250920000269616, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2110259999644768, "p50": 1.2139859999820146, "p90": 1.214847000028385, "mean": 1.2134921999972903, "iqr": 0.002731000051880983, "raw_times": [1.2139859999820146, 1.212115999976504, 1.215486000035071, 1.214847000028385, 1.2110259999644768], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2041449999742326, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2489469999650282, "p50": 1.2594169999715632, "p90": 1.2745669999958409, "mean": 1.2628269999936492, "iqr": 0.020920999986628885, "raw_times": [1.2489469999650282, 1.2594169999715632, 1.253646000009212, 1.2745669999958409, 1.2775580000266018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2641869999470146, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-29T04:14:30Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2830869999902461, "p50": 1.288437999960479, "p90": 1.2899880000531994, "mean": 1.287595600001623, "iqr": 0.0053310000680539815, "raw_times": [1.2846569999851454, 1.2899880000531994, 1.288437999960479, 1.2918080000190457, 1.2830869999902461], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2736869999798728, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.3104980000093747, "p50": 1.3190589999680924, "p90": 1.3191280000341976, "mean": 1.3194864000070083, "iqr": 0.002650000055837154, "raw_times": [1.3104980000093747, 1.3322690000450166, 1.3190589999680924, 1.3164779999783605, 1.3191280000341976], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.3112579999869922, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.470441000037681, "p50": 1.4753519999999298, "p90": 1.4777020000451557, "mean": 1.4751576000094246, "iqr": 0.00719000007620707, "raw_times": [1.4777020000451557, 1.4753519999999298, 1.481780999995408, 1.470441000037681, 1.4705119999689487], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.4788519999910932, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-29T04:14:31Z", "run": "f13d7fc2a96a4974bd950f1b5546eb33", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4914220000150635, "p50": 1.5084219999721427, "p90": 1.5101930000014363, "mean": 1.5035721999993257, "iqr": 0.01671100000066872, "raw_times": [1.4914220000150635, 1.4934820000007676, 1.5084219999721427, 1.5101930000014363, 1.5143420000072183], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.497162000021035, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,18 +12,18 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- import xformers.ops as xops
17
 
18
 
19
- def xformers_attention(q, k, v):
20
- """xFormers memory efficient attention"""
21
- # xFormers expects [batch, seq_len, heads, head_dim]
22
- return xops.memory_efficient_attention(q, k, v)
 
23
 
24
 
25
  run_benchmark(
26
  kernel_type=KernelTypeEnum.ATTENTION,
27
- impl_name="xformers_meff",
28
- impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
29
- impl_func=xformers_attention,
30
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
16
 
17
+ def torch_flash(q, k, v):
18
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
19
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
20
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
21
+ return o.transpose(1, 2).contiguous()
22
 
23
 
24
  run_benchmark(
25
  kernel_type=KernelTypeEnum.ATTENTION,
26
+ impl_name="torch_flash_ma",
27
+ impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
28
+ impl_func=torch_flash,
29
  )
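A usage sketch for the torch_flash wrapper defined above. The benchmark's attention inputs are [batch, seq_len, heads, head_dim] (per the workload metadata), so the wrapper transposes to the [batch, heads, seq, head_dim] layout that scaled_dot_product_attention expects and transposes back afterwards. Shapes below mirror the cuda_attn_L128_bfloat16 workload; a CUDA device is assumed for the FLASH backend:

import torch

q = torch.randn(1, 4224, 24, 128, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
out = torch_flash(q, k, v)   # runs under the SDPBackend.FLASH_ATTENTION context above
print(out.shape)             # torch.Size([1, 4224, 24, 128])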
flash_attn/impls/flash_attention.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 00:36:31 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
- | N/A 32C P0 151W / 350W | 0MiB / 46068MiB | 86% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.578ms 102.17% 3.578ms 3.578ms 1
3976
- torch_flash_ma 6.87% 353.422us 46.38% 2.386ms 2.386ms 0.000us 0.00% 3.542ms 3.542ms 1
3977
- aten::scaled_dot_product_attention 0.81% 41.691us 4.31% 221.887us 73.962us 0.000us 0.00% 2.788ms 929.262us 3
3978
- aten::_scaled_dot_product_flash_attention 0.53% 27.420us 3.50% 180.196us 60.065us 0.000us 0.00% 2.788ms 929.262us 3
3979
- aten::_flash_attention_forward 0.77% 39.803us 2.56% 131.456us 43.819us 2.788ms 79.61% 2.788ms 929.262us 3
3980
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.788ms 79.61% 2.788ms 929.262us 3
3981
- aten::contiguous 0.28% 14.581us 33.97% 1.748ms 145.626us 0.000us 0.00% 754.272us 62.856us 12
3982
- aten::clone 0.77% 39.360us 33.69% 1.733ms 144.411us 0.000us 0.00% 754.272us 62.856us 12
3983
- aten::copy_ 1.64% 84.313us 31.38% 1.614ms 134.494us 713.920us 20.39% 754.272us 62.856us 12
3984
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 713.920us 20.39% 713.920us 59.493us 12
3985
- Activity Buffer Request 27.68% 1.424ms 27.68% 1.424ms 1.424ms 40.352us 1.15% 40.352us 40.352us 1
3986
- aten::transpose 1.22% 62.617us 1.64% 84.135us 3.506us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.42% 21.518us 0.42% 21.518us 0.897us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.49% 25.079us 1.99% 102.243us 6.816us 0.000us 0.00% 0.000us 0.000us 15
3989
- aten::empty 1.77% 91.033us 1.77% 91.033us 3.793us 0.000us 0.00% 0.000us 0.000us 24
3990
- cudaLaunchKernel 2.57% 132.402us 2.57% 132.402us 8.827us 0.000us 0.00% 0.000us 0.000us 15
3991
- aten::empty_strided 0.32% 16.702us 0.32% 16.702us 5.567us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaDeviceGetAttribute 0.05% 2.750us 0.05% 2.750us 0.458us 0.000us 0.00% 0.000us 0.000us 6
3993
- cudaFuncSetAttribute 0.17% 9.001us 0.17% 9.001us 3.000us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaDeviceSynchronize 53.62% 2.758ms 53.62% 2.758ms 2.758ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- Self CPU time total: 5.144ms
3997
- Self CUDA time total: 3.502ms
3998
 
3999
 
4000
 
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- torch_flash_ma 4.93% 257.698us 42.06% 2.199ms 2.199ms 0.000us 0.00% 3.742ms 3.742ms 1
4008
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.698ms 100.30% 3.698ms 3.698ms 1
4009
- aten::scaled_dot_product_attention 0.48% 25.212us 3.48% 182.067us 60.689us 0.000us 0.00% 2.929ms 976.488us 3
4010
- aten::_scaled_dot_product_flash_attention 0.39% 20.471us 3.00% 156.855us 52.285us 0.000us 0.00% 2.929ms 976.488us 3
4011
- aten::_flash_attention_forward 0.74% 38.430us 2.18% 114.074us 38.025us 2.929ms 79.45% 2.929ms 976.488us 3
4012
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.929ms 79.45% 2.929ms 976.488us 3
4013
- aten::contiguous 0.17% 9.122us 32.76% 1.713ms 142.713us 0.000us 0.00% 812.318us 67.693us 12
4014
- aten::clone 0.59% 31.068us 32.59% 1.703ms 141.953us 0.000us 0.00% 812.318us 67.693us 12
4015
- aten::copy_ 1.50% 78.513us 30.83% 1.612ms 134.315us 757.726us 20.55% 812.318us 67.693us 12
4016
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 757.726us 20.55% 757.726us 63.144us 12
4017
- Activity Buffer Request 27.74% 1.450ms 27.74% 1.450ms 1.450ms 54.592us 1.48% 54.592us 54.592us 1
4018
- aten::transpose 0.99% 51.637us 1.32% 68.781us 2.866us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::as_strided 0.33% 17.144us 0.33% 17.144us 0.714us 0.000us 0.00% 0.000us 0.000us 24
4020
- aten::empty_like 0.41% 21.274us 1.52% 79.248us 5.283us 0.000us 0.00% 0.000us 0.000us 15
4021
- aten::empty 1.40% 73.206us 1.40% 73.206us 3.050us 0.000us 0.00% 0.000us 0.000us 24
4022
- cudaLaunchKernel 2.03% 106.061us 2.03% 106.061us 7.071us 0.000us 0.00% 0.000us 0.000us 15
4023
- aten::empty_strided 0.26% 13.410us 0.26% 13.410us 4.470us 0.000us 0.00% 0.000us 0.000us 3
4024
  cudaDeviceGetAttribute 0.04% 1.900us 0.04% 1.900us 0.317us 0.000us 0.00% 0.000us 0.000us 6
4025
- cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 57.94% 3.028ms 57.94% 3.028ms 3.028ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 5.227ms
4029
- Self CUDA time total: 3.687ms
4030
 
4031
 
4032
 
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- torch_flash_ma 4.92% 259.759us 41.31% 2.182ms 2.182ms 0.000us 0.00% 3.825ms 3.825ms 1
4040
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.778ms 100.30% 3.778ms 3.778ms 1
4041
- aten::scaled_dot_product_attention 0.46% 24.480us 3.48% 183.685us 61.228us 0.000us 0.00% 2.990ms 996.566us 3
4042
- aten::_scaled_dot_product_flash_attention 0.36% 18.972us 3.01% 159.205us 53.068us 0.000us 0.00% 2.990ms 996.566us 3
4043
- aten::_flash_attention_forward 0.75% 39.470us 2.21% 116.583us 38.861us 2.990ms 79.38% 2.990ms 996.566us 3
4044
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.990ms 79.38% 2.990ms 996.566us 3
4045
- aten::contiguous 0.20% 10.370us 32.06% 1.693ms 141.118us 0.000us 0.00% 835.605us 69.634us 12
4046
- aten::clone 0.56% 29.562us 31.86% 1.683ms 140.254us 0.000us 0.00% 835.605us 69.634us 12
4047
- aten::copy_ 1.55% 81.613us 30.00% 1.585ms 132.057us 776.758us 20.62% 835.605us 69.634us 12
4048
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 776.758us 20.62% 776.758us 64.730us 12
4049
- Activity Buffer Request 26.94% 1.423ms 26.94% 1.423ms 1.423ms 58.847us 1.56% 58.847us 58.847us 1
4050
- aten::transpose 0.97% 51.460us 1.30% 68.660us 2.861us 0.000us 0.00% 0.000us 0.000us 24
4051
- aten::as_strided 0.33% 17.200us 0.33% 17.200us 0.717us 0.000us 0.00% 0.000us 0.000us 24
4052
- aten::empty_like 0.39% 20.693us 1.67% 88.333us 5.889us 0.000us 0.00% 0.000us 0.000us 15
4053
- aten::empty 1.54% 81.451us 1.54% 81.451us 3.394us 0.000us 0.00% 0.000us 0.000us 24
4054
- cudaLaunchKernel 1.97% 104.004us 1.97% 104.004us 6.934us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_strided 0.28% 14.530us 0.28% 14.530us 4.843us 0.000us 0.00% 0.000us 0.000us 3
4056
- cudaDeviceGetAttribute 0.04% 1.902us 0.04% 1.902us 0.317us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaFuncSetAttribute 0.07% 3.600us 0.07% 3.600us 1.200us 0.000us 0.00% 0.000us 0.000us 3
4058
- cudaDeviceSynchronize 58.69% 3.100ms 58.69% 3.100ms 3.100ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- Self CPU time total: 5.282ms
4061
- Self CUDA time total: 3.766ms
4062
 
4063
 
4064
 
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- torch_flash_ma 4.63% 260.119us 43.14% 2.422ms 2.422ms 0.000us 0.00% 3.911ms 3.911ms 1
4072
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.865ms 100.31% 3.865ms 3.865ms 1
4073
- aten::scaled_dot_product_attention 0.43% 24.361us 3.22% 180.586us 60.195us 0.000us 0.00% 3.069ms 1.023ms 3
4074
- aten::_scaled_dot_product_flash_attention 0.35% 19.401us 2.78% 156.225us 52.075us 0.000us 0.00% 3.069ms 1.023ms 3
4075
- aten::_flash_attention_forward 0.68% 38.111us 2.03% 114.053us 38.018us 3.069ms 79.64% 3.069ms 1.023ms 3
4076
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.069ms 79.64% 3.069ms 1.023ms 3
4077
- aten::contiguous 0.17% 9.669us 34.46% 1.935ms 161.211us 0.000us 0.00% 842.147us 70.179us 12
4078
- aten::clone 0.54% 30.453us 34.29% 1.925ms 160.405us 0.000us 0.00% 842.147us 70.179us 12
4079
- aten::copy_ 1.42% 79.471us 32.63% 1.832ms 152.656us 784.675us 20.36% 842.147us 70.179us 12
4080
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.675us 20.36% 784.675us 65.390us 12
4081
- Activity Buffer Request 26.20% 1.471ms 26.20% 1.471ms 1.471ms 57.472us 1.49% 57.472us 57.472us 1
4082
- aten::transpose 0.92% 51.697us 1.23% 69.261us 2.886us 0.000us 0.00% 0.000us 0.000us 24
4083
- aten::as_strided 0.31% 17.564us 0.31% 17.564us 0.732us 0.000us 0.00% 0.000us 0.000us 24
4084
- aten::empty_like 0.36% 20.299us 1.45% 81.452us 5.430us 0.000us 0.00% 0.000us 0.000us 15
4085
- aten::empty 1.34% 75.405us 1.34% 75.405us 3.142us 0.000us 0.00% 0.000us 0.000us 24
4086
- cudaLaunchKernel 5.43% 304.654us 5.43% 304.654us 20.310us 0.000us 0.00% 0.000us 0.000us 15
4087
- aten::empty_strided 0.25% 13.960us 0.25% 13.960us 4.653us 0.000us 0.00% 0.000us 0.000us 3
4088
- cudaDeviceGetAttribute 0.03% 1.839us 0.03% 1.839us 0.306us 0.000us 0.00% 0.000us 0.000us 6
4089
- cudaFuncSetAttribute 0.07% 3.750us 0.07% 3.750us 1.250us 0.000us 0.00% 0.000us 0.000us 3
4090
- cudaDeviceSynchronize 56.86% 3.192ms 56.86% 3.192ms 3.192ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
- Self CPU time total: 5.614ms
4093
- Self CUDA time total: 3.854ms
4094
 
4095
 
4096
 
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- torch_flash_ma 5.20% 312.192us 40.27% 2.420ms 2.420ms 0.000us 0.00% 4.370ms 4.370ms 1
4104
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.320ms 100.26% 4.320ms 4.320ms 1
4105
- aten::scaled_dot_product_attention 0.42% 25.401us 3.13% 188.317us 62.772us 0.000us 0.00% 3.499ms 1.166ms 3
4106
- aten::_scaled_dot_product_flash_attention 0.34% 20.373us 2.71% 162.916us 54.305us 0.000us 0.00% 3.499ms 1.166ms 3
4107
- aten::_flash_attention_forward 0.70% 41.822us 1.99% 119.463us 39.821us 3.499ms 81.21% 3.499ms 1.166ms 3
4108
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.499ms 81.21% 3.499ms 1.166ms 3
4109
- aten::contiguous 0.17% 10.061us 31.18% 1.873ms 156.120us 0.000us 0.00% 870.813us 72.568us 12
4110
- aten::clone 0.51% 30.510us 31.01% 1.863ms 155.281us 0.000us 0.00% 870.813us 72.568us 12
4111
- aten::copy_ 1.32% 79.253us 29.46% 1.770ms 147.488us 809.726us 18.79% 870.813us 72.568us 12
4112
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 809.726us 18.79% 809.726us 67.477us 12
4113
- Activity Buffer Request 23.71% 1.425ms 23.71% 1.425ms 1.425ms 61.087us 1.42% 61.087us 61.087us 1
4114
- aten::transpose 0.85% 51.371us 1.15% 68.940us 2.873us 0.000us 0.00% 0.000us 0.000us 24
4115
- aten::as_strided 0.29% 17.569us 0.29% 17.569us 0.732us 0.000us 0.00% 0.000us 0.000us 24
4116
- aten::empty_like 0.34% 20.420us 1.39% 83.415us 5.561us 0.000us 0.00% 0.000us 0.000us 15
4117
- aten::empty 1.27% 76.235us 1.27% 76.235us 3.176us 0.000us 0.00% 0.000us 0.000us 24
4118
- cudaLaunchKernel 4.81% 288.717us 4.81% 288.717us 19.248us 0.000us 0.00% 0.000us 0.000us 15
4119
- aten::empty_strided 0.26% 15.360us 0.26% 15.360us 5.120us 0.000us 0.00% 0.000us 0.000us 3
4120
- cudaDeviceGetAttribute 0.03% 1.980us 0.03% 1.980us 0.330us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaFuncSetAttribute 0.06% 3.780us 0.06% 3.780us 1.260us 0.000us 0.00% 0.000us 0.000us 3
4122
- cudaDeviceSynchronize 59.73% 3.589ms 59.73% 3.589ms 3.589ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
- Self CPU time total: 6.009ms
4125
- Self CUDA time total: 4.309ms
4126
 
4127
 
4128
 
@@ -4132,38 +4132,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
- torch_flash_ma 4.62% 283.749us 39.30% 2.416ms 2.416ms 0.000us 0.00% 4.488ms 4.488ms 1
4136
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.437ms 100.26% 4.437ms 4.437ms 1
4137
- aten::scaled_dot_product_attention 0.41% 25.050us 2.99% 183.606us 61.202us 0.000us 0.00% 3.606ms 1.202ms 3
4138
- aten::_scaled_dot_product_flash_attention 0.32% 19.512us 2.58% 158.556us 52.852us 0.000us 0.00% 3.606ms 1.202ms 3
4139
- aten::_flash_attention_forward 0.64% 39.583us 1.89% 116.223us 38.741us 3.606ms 81.47% 3.606ms 1.202ms 3
4140
- void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 81.47% 3.606ms 1.202ms 3
4141
- aten::contiguous 0.16% 9.930us 30.93% 1.901ms 158.420us 0.000us 0.00% 882.206us 73.517us 12
4142
- aten::clone 0.49% 30.220us 30.76% 1.891ms 157.592us 0.000us 0.00% 882.206us 73.517us 12
4143
- aten::copy_ 1.34% 82.326us 29.23% 1.797ms 149.726us 820.351us 18.53% 882.206us 73.517us 12
4144
- void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 820.351us 18.53% 820.351us 68.363us 12
4145
- Activity Buffer Request 23.42% 1.439ms 23.42% 1.439ms 1.439ms 61.855us 1.40% 61.855us 61.855us 1
4146
- aten::transpose 0.85% 52.248us 1.14% 70.082us 2.920us 0.000us 0.00% 0.000us 0.000us 24
4147
- aten::as_strided 0.29% 17.834us 0.29% 17.834us 0.743us 0.000us 0.00% 0.000us 0.000us 24
4148
- aten::empty_like 0.33% 20.531us 1.36% 83.782us 5.585us 0.000us 0.00% 0.000us 0.000us 15
4149
- aten::empty 1.26% 77.251us 1.26% 77.251us 3.219us 0.000us 0.00% 0.000us 0.000us 24
4150
- cudaLaunchKernel 4.84% 297.592us 4.84% 297.592us 19.839us 0.000us 0.00% 0.000us 0.000us 15
4151
- aten::empty_strided 0.24% 14.660us 0.24% 14.660us 4.887us 0.000us 0.00% 0.000us 0.000us 3
4152
- cudaDeviceGetAttribute 0.03% 1.929us 0.03% 1.929us 0.321us 0.000us 0.00% 0.000us 0.000us 6
4153
- cudaFuncSetAttribute 0.06% 3.839us 0.06% 3.839us 1.280us 0.000us 0.00% 0.000us 0.000us 3
4154
- cudaDeviceSynchronize 60.70% 3.731ms 60.70% 3.731ms 3.731ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
- Self CPU time total: 6.147ms
4157
- Self CUDA time total: 4.426ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4162
- torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4163
  torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4164
  torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4165
- torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4166
- torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4167
  </pre></div>
4168
  <div class="cell-artifacts">
4169
  <h4>Artifacts:</h4>
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:14:27 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
 
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
+ | N/A 36C P0 80W / 350W | 0MiB / 46068MiB | 11% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
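The nv cell's body is not shown in this hunk; a minimal sketch of what it presumably does is simply shelling out to nvidia-smi and echoing the report:

import subprocess

# Hypothetical reconstruction of the "nv" cell: print the GPU status report.
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)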
 
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 102.17% 3.590ms 3.590ms 1
3976
+ torch_flash_ma 6.85% 354.470us 47.44% 2.454ms 2.454ms 0.000us 0.00% 3.554ms 3.554ms 1
3977
+ aten::scaled_dot_product_attention 0.84% 43.371us 4.38% 226.614us 75.538us 0.000us 0.00% 2.798ms 932.564us 3
3978
+ aten::_scaled_dot_product_flash_attention 0.52% 27.141us 3.54% 183.243us 61.081us 0.000us 0.00% 2.798ms 932.564us 3
3979
+ aten::_flash_attention_forward 0.84% 43.539us 2.59% 134.122us 44.707us 2.798ms 79.63% 2.798ms 932.564us 3
3980
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.798ms 79.63% 2.798ms 932.564us 3
3981
+ aten::contiguous 0.29% 14.889us 34.84% 1.803ms 150.217us 0.000us 0.00% 755.939us 62.995us 12
3982
+ aten::clone 0.79% 40.742us 34.56% 1.788ms 148.977us 0.000us 0.00% 755.939us 62.995us 12
3983
+ aten::copy_ 1.80% 93.020us 31.59% 1.634ms 136.197us 715.586us 20.37% 755.939us 62.995us 12
3984
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.586us 20.37% 715.586us 59.632us 12
3985
+ Activity Buffer Request 27.64% 1.430ms 27.64% 1.430ms 1.430ms 40.353us 1.15% 40.353us 40.353us 1
3986
+ aten::transpose 1.35% 70.048us 1.79% 92.780us 3.866us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.44% 22.732us 0.44% 22.732us 0.947us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.49% 25.480us 2.63% 136.134us 9.076us 0.000us 0.00% 0.000us 0.000us 15
3989
+ aten::empty 2.37% 122.383us 2.37% 122.383us 5.099us 0.000us 0.00% 0.000us 0.000us 24
3990
+ cudaLaunchKernel 2.63% 136.154us 2.63% 136.154us 9.077us 0.000us 0.00% 0.000us 0.000us 15
3991
+ aten::empty_strided 0.35% 17.861us 0.35% 17.861us 5.954us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaDeviceGetAttribute 0.05% 2.732us 0.05% 2.732us 0.455us 0.000us 0.00% 0.000us 0.000us 6
3993
+ cudaFuncSetAttribute 0.19% 10.040us 0.19% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaDeviceSynchronize 52.56% 2.719ms 52.56% 2.719ms 2.719ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ Self CPU time total: 5.174ms
3997
+ Self CUDA time total: 3.513ms
3998
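A generic recipe for producing a trace like the one above with torch.profiler (not necessarily the harness's exact settings); torch_flash, q, k, and v are the names from the sketches earlier:

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    out = torch_flash(q, k, v)
    torch.cuda.synchronize()   # ensure CUDA work is captured before the context exits

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))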
 
3999
 
4000
 
 
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ torch_flash_ma 5.13% 269.966us 42.38% 2.232ms 2.232ms 0.000us 0.00% 3.778ms 3.778ms 1
4008
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.734ms 100.30% 3.734ms 3.734ms 1
4009
+ aten::scaled_dot_product_attention 0.51% 26.890us 3.58% 188.304us 62.768us 0.000us 0.00% 2.960ms 986.590us 3
4010
+ aten::_scaled_dot_product_flash_attention 0.35% 18.589us 3.07% 161.414us 53.805us 0.000us 0.00% 2.960ms 986.590us 3
4011
+ aten::_flash_attention_forward 0.78% 41.299us 2.29% 120.413us 40.138us 2.960ms 79.51% 2.960ms 986.590us 3
4012
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.960ms 79.51% 2.960ms 986.590us 3
4013
+ aten::contiguous 0.18% 9.501us 32.77% 1.726ms 143.802us 0.000us 0.00% 818.206us 68.184us 12
4014
+ aten::clone 0.54% 28.568us 32.59% 1.716ms 143.010us 0.000us 0.00% 818.206us 68.184us 12
4015
+ aten::copy_ 1.52% 80.181us 30.79% 1.621ms 135.119us 762.846us 20.49% 818.206us 68.184us 12
4016
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.846us 20.49% 762.846us 63.571us 12
4017
+ Activity Buffer Request 27.52% 1.449ms 27.52% 1.449ms 1.449ms 55.360us 1.49% 55.360us 55.360us 1
4018
+ aten::transpose 1.00% 52.915us 1.33% 70.084us 2.920us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::as_strided 0.33% 17.169us 0.33% 17.169us 0.715us 0.000us 0.00% 0.000us 0.000us 24
4020
+ aten::empty_like 0.39% 20.652us 1.64% 86.425us 5.762us 0.000us 0.00% 0.000us 0.000us 15
4021
+ aten::empty 1.51% 79.433us 1.51% 79.433us 3.310us 0.000us 0.00% 0.000us 0.000us 24
4022
+ cudaLaunchKernel 2.18% 114.743us 2.18% 114.743us 7.650us 0.000us 0.00% 0.000us 0.000us 15
4023
+ aten::empty_strided 0.29% 15.331us 0.29% 15.331us 5.110us 0.000us 0.00% 0.000us 0.000us 3
4024
  cudaDeviceGetAttribute 0.04% 1.900us 0.04% 1.900us 0.317us 0.000us 0.00% 0.000us 0.000us 6
4025
+ cudaFuncSetAttribute 0.10% 5.520us 0.10% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 57.62% 3.034ms 57.62% 3.034ms 3.034ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 5.265ms
4029
+ Self CUDA time total: 3.723ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ torch_flash_ma 5.04% 266.137us 41.64% 2.197ms 2.197ms 0.000us 0.00% 3.820ms 3.820ms 1
4040
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.772ms 100.29% 3.772ms 3.772ms 1
4041
+ aten::scaled_dot_product_attention 0.49% 25.880us 3.59% 189.194us 63.065us 0.000us 0.00% 2.983ms 994.205us 3
4042
+ aten::_scaled_dot_product_flash_attention 0.37% 19.363us 3.10% 163.314us 54.438us 0.000us 0.00% 2.983ms 994.205us 3
4043
+ aten::_flash_attention_forward 0.81% 42.782us 2.31% 121.862us 40.621us 2.983ms 79.31% 2.983ms 994.205us 3
4044
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.983ms 79.31% 2.983ms 994.205us 3
4045
+ aten::contiguous 0.18% 9.290us 32.12% 1.695ms 141.255us 0.000us 0.00% 836.990us 69.749us 12
4046
+ aten::clone 0.53% 27.791us 31.95% 1.686ms 140.481us 0.000us 0.00% 836.990us 69.749us 12
4047
+ aten::copy_ 1.57% 82.879us 30.22% 1.595ms 132.896us 778.238us 20.69% 836.990us 69.749us 12
4048
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 778.238us 20.69% 778.238us 64.853us 12
4049
+ Activity Buffer Request 26.92% 1.420ms 26.92% 1.420ms 1.420ms 58.752us 1.56% 58.752us 58.752us 1
4050
+ aten::transpose 0.98% 51.581us 1.30% 68.820us 2.868us 0.000us 0.00% 0.000us 0.000us 24
4051
+ aten::as_strided 0.33% 17.239us 0.33% 17.239us 0.718us 0.000us 0.00% 0.000us 0.000us 24
4052
+ aten::empty_like 0.35% 18.669us 1.58% 83.581us 5.572us 0.000us 0.00% 0.000us 0.000us 15
4053
+ aten::empty 1.49% 78.372us 1.49% 78.372us 3.265us 0.000us 0.00% 0.000us 0.000us 24
4054
+ cudaLaunchKernel 2.17% 114.523us 2.17% 114.523us 7.635us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_strided 0.29% 15.511us 0.29% 15.511us 5.170us 0.000us 0.00% 0.000us 0.000us 3
4056
+ cudaDeviceGetAttribute 0.04% 2.300us 0.04% 2.300us 0.383us 0.000us 0.00% 0.000us 0.000us 6
4057
+ cudaFuncSetAttribute 0.09% 4.560us 0.09% 4.560us 1.520us 0.000us 0.00% 0.000us 0.000us 3
4058
+ cudaDeviceSynchronize 58.36% 3.079ms 58.36% 3.079ms 3.079ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ Self CPU time total: 5.277ms
4061
+ Self CUDA time total: 3.761ms
4062
 
4063
 
4064
 
 
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 4.81% 269.664us 43.38% 2.432ms 2.432ms 0.000us 0.00% 3.921ms 3.921ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.875ms 100.29% 3.875ms 3.875ms 1
+ aten::scaled_dot_product_attention 0.47% 26.530us 3.32% 186.254us 62.085us 0.000us 0.00% 3.079ms 1.026ms 3
+ aten::_scaled_dot_product_flash_attention 0.33% 18.670us 2.85% 159.724us 53.241us 0.000us 0.00% 3.079ms 1.026ms 3
+ aten::_flash_attention_forward 0.73% 41.012us 2.12% 118.963us 39.654us 3.079ms 79.68% 3.079ms 1.026ms 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.079ms 79.68% 3.079ms 1.026ms 3
+ aten::contiguous 0.17% 9.411us 34.39% 1.928ms 160.703us 0.000us 0.00% 842.199us 70.183us 12
+ aten::clone 0.52% 28.883us 34.22% 1.919ms 159.919us 0.000us 0.00% 842.199us 70.183us 12
+ aten::copy_ 1.48% 82.822us 32.55% 1.825ms 152.123us 784.952us 20.32% 842.199us 70.183us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 784.952us 20.32% 784.952us 65.413us 12
+ Activity Buffer Request 25.77% 1.445ms 25.77% 1.445ms 1.445ms 57.247us 1.48% 57.247us 57.247us 1
+ aten::transpose 0.94% 52.967us 1.25% 70.184us 2.924us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.31% 17.217us 0.31% 17.217us 0.717us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.34% 19.178us 1.51% 84.829us 5.655us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.41% 78.973us 1.41% 78.973us 3.291us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 5.72% 320.465us 5.72% 320.465us 21.364us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 15.229us 0.27% 15.229us 5.076us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.110us 0.04% 2.110us 0.352us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 4.130us 0.07% 4.130us 1.377us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 56.62% 3.175ms 56.62% 3.175ms 3.175ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.607ms
+ Self CUDA time total: 3.864ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 5.31% 318.398us 40.52% 2.428ms 2.428ms 0.000us 0.00% 4.370ms 4.370ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.321ms 100.25% 4.321ms 4.321ms 1
+ aten::scaled_dot_product_attention 0.43% 25.890us 3.27% 195.733us 65.244us 0.000us 0.00% 3.503ms 1.168ms 3
+ aten::_scaled_dot_product_flash_attention 0.32% 19.430us 2.83% 169.843us 56.614us 0.000us 0.00% 3.503ms 1.168ms 3
+ aten::_flash_attention_forward 0.75% 44.733us 2.13% 127.534us 42.511us 3.503ms 81.28% 3.503ms 1.168ms 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.503ms 81.28% 3.503ms 1.168ms 3
+ aten::contiguous 0.16% 9.533us 31.15% 1.866ms 155.517us 0.000us 0.00% 867.131us 72.261us 12
+ aten::clone 0.48% 28.649us 30.99% 1.857ms 154.722us 0.000us 0.00% 867.131us 72.261us 12
+ aten::copy_ 1.37% 82.103us 29.43% 1.763ms 146.944us 806.940us 18.72% 867.131us 72.261us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.940us 18.72% 806.940us 67.245us 12
+ Activity Buffer Request 23.90% 1.432ms 23.90% 1.432ms 1.432ms 60.191us 1.40% 60.191us 60.191us 1
+ aten::transpose 0.87% 52.328us 1.17% 70.130us 2.922us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.30% 17.802us 0.30% 17.802us 0.742us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.33% 20.052us 1.44% 86.062us 5.737us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.32% 79.270us 1.32% 79.270us 3.303us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.58% 274.314us 4.58% 274.314us 18.288us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.27% 16.430us 0.27% 16.430us 5.477us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.360us 0.04% 2.360us 0.393us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.07% 4.210us 0.07% 4.210us 1.403us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.48% 3.564ms 59.48% 3.564ms 3.564ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.991ms
+ Self CUDA time total: 4.310ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_flash_ma 3.92% 237.516us 38.06% 2.305ms 2.305ms 0.000us 0.00% 4.487ms 4.487ms 1
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.438ms 100.25% 4.438ms 4.438ms 1
+ aten::scaled_dot_product_attention 0.44% 26.369us 3.02% 182.943us 60.981us 0.000us 0.00% 3.605ms 1.202ms 3
+ aten::_scaled_dot_product_flash_attention 0.31% 18.541us 2.59% 156.574us 52.191us 0.000us 0.00% 3.605ms 1.202ms 3
+ aten::_flash_attention_forward 0.63% 38.112us 1.91% 115.882us 38.627us 3.605ms 81.43% 3.605ms 1.202ms 3
+ void pytorch_flash::flash_fwd_kernel<Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.605ms 81.43% 3.605ms 1.202ms 3
+ aten::contiguous 0.15% 9.281us 30.31% 1.836ms 153.003us 0.000us 0.00% 882.684us 73.557us 12
+ aten::clone 0.47% 28.328us 30.16% 1.827ms 152.229us 0.000us 0.00% 882.684us 73.557us 12
+ aten::copy_ 1.32% 79.871us 28.64% 1.734ms 144.531us 822.268us 18.57% 882.684us 73.557us 12
+ void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 822.268us 18.57% 822.268us 68.522us 12
+ Activity Buffer Request 23.38% 1.416ms 23.38% 1.416ms 1.416ms 60.416us 1.36% 60.416us 60.416us 1
+ aten::transpose 0.89% 53.992us 1.17% 70.941us 2.956us 0.000us 0.00% 0.000us 0.000us 24
+ aten::as_strided 0.28% 16.949us 0.28% 16.949us 0.706us 0.000us 0.00% 0.000us 0.000us 24
+ aten::empty_like 0.33% 19.985us 1.39% 84.474us 5.632us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.27% 76.679us 1.27% 76.679us 3.195us 0.000us 0.00% 0.000us 0.000us 24
+ cudaLaunchKernel 4.33% 262.156us 4.33% 262.156us 17.477us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_strided 0.26% 15.620us 0.26% 15.620us 5.207us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceGetAttribute 0.04% 2.329us 0.04% 2.329us 0.388us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.06% 3.781us 0.06% 3.781us 1.260us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.94% 3.751ms 61.94% 3.751ms 3.751ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 6.057ms
+ Self CUDA time total: 4.427ms

  impl wl p50(ms) ok
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
+ torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
  torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
  torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
+ torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
+ torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
  </pre></div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
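
Note: the torch_flash_ma traces above are standard PyTorch profiler tables. A minimal sketch of how such a table can be produced (an assumed harness for illustration; the actual cells/benchmark.py may differ in shapes and repetition counts):

    import torch
    from torch.profiler import ProfilerActivity, profile, record_function

    # Illustrative shapes: (batch, heads, seq_len, head_dim) in bfloat16 on CUDA.
    q = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("torch_flash_ma"):  # the labeled rows in the tables above
            for _ in range(3):  # matches the "# of Calls" column showing 3 attention calls
                out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
        torch.cuda.synchronize()  # appears as the cudaDeviceSynchronize row

    # Prints a table in the same format as the PROFILE TRACE sections.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
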
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
- Cell: benchmark | 5.88s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 3.54% 153.223us 41.10% 1.781ms 1.781ms 0.000us 0.00% 3.710ms 3.710ms 1
- _flash_attn_9e27194::fwd 1.64% 71.013us 37.57% 1.628ms 542.522us 2.765ms 100.00% 3.710ms 1.237ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.05% 2.766ms 2.766ms 1
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.765ms 100.00% 2.765ms 921.626us 3
- Activity Buffer Request 32.85% 1.423ms 32.85% 1.423ms 1.423ms 945.530us 34.20% 945.530us 945.530us 1
- cudaDeviceGetAttribute 0.11% 4.920us 0.11% 4.920us 0.328us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.37% 16.201us 1.19% 51.582us 17.194us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.82% 35.381us 0.82% 35.381us 11.794us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.55% 23.891us 0.55% 23.891us 2.655us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.27% 11.501us 0.27% 11.501us 3.834us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.96% 41.661us 0.96% 41.661us 13.887us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 58.90% 2.552ms 58.90% 2.552ms 2.552ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.332ms
- Self CUDA time total: 2.765ms

@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 1.95% 87.173us 36.43% 1.628ms 1.628ms 0.000us 0.00% 3.993ms 3.993ms 1
- _flash_attn_9e27194::fwd 1.10% 49.286us 34.48% 1.541ms 513.554us 2.982ms 100.00% 3.993ms 1.331ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.984ms 100.06% 2.984ms 2.984ms 1
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.00% 2.982ms 993.983us 3
- Activity Buffer Request 31.65% 1.414ms 31.65% 1.414ms 1.414ms 1.011ms 33.92% 1.011ms 1.011ms 1
- cudaDeviceGetAttribute 0.09% 3.827us 0.09% 3.827us 0.255us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.16% 7.330us 0.51% 22.831us 7.610us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.35% 15.501us 0.35% 15.501us 5.167us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.46% 20.669us 0.46% 20.669us 2.297us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.520us 0.08% 3.520us 1.173us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.59% 26.211us 0.59% 26.211us 8.737us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 63.57% 2.841ms 63.57% 2.841ms 2.841ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.469ms
- Self CUDA time total: 2.982ms

@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.39% 107.943us 36.87% 1.664ms 1.664ms 0.000us 0.00% 4.011ms 4.011ms 1
- _flash_attn_9e27194::fwd 1.08% 48.663us 34.47% 1.556ms 518.528us 2.994ms 100.00% 4.011ms 1.337ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.996ms 100.05% 2.996ms 2.996ms 1
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.994ms 100.00% 2.994ms 998.054us 3
- Activity Buffer Request 31.64% 1.428ms 31.64% 1.428ms 1.428ms 1.017ms 33.96% 1.017ms 1.017ms 1
- cudaDeviceGetAttribute 0.09% 4.050us 0.09% 4.050us 0.270us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.16% 7.029us 0.54% 24.521us 8.174us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.39% 17.492us 0.39% 17.492us 5.831us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.46% 20.589us 0.46% 20.589us 2.288us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.660us 0.08% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.59% 26.452us 0.59% 26.452us 8.817us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 63.13% 2.849ms 63.13% 2.849ms 2.849ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.512ms
- Self CUDA time total: 2.994ms

@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.37% 113.154us 39.04% 1.864ms 1.864ms 0.000us 0.00% 4.086ms 4.086ms 1
- _flash_attn_9e27194::fwd 1.02% 48.863us 36.67% 1.751ms 583.543us 3.059ms 100.00% 4.086ms 1.362ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.060ms 100.05% 3.060ms 3.060ms 1
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.059ms 100.00% 3.059ms 1.020ms 3
- Activity Buffer Request 29.92% 1.429ms 29.92% 1.429ms 1.429ms 1.027ms 33.57% 1.027ms 1.027ms 1
- cudaDeviceGetAttribute 0.08% 3.821us 0.08% 3.821us 0.255us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.16% 7.819us 0.54% 25.920us 8.640us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.38% 18.101us 0.38% 18.101us 6.034us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.44% 21.109us 0.44% 21.109us 2.345us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.58% 218.538us 4.58% 218.538us 72.846us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 60.96% 2.910ms 60.96% 2.910ms 2.910ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.774ms
- Self CUDA time total: 3.059ms

@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.11% 109.115us 34.87% 1.804ms 1.804ms 0.000us 0.00% 4.702ms 4.702ms 1
- _flash_attn_9e27194::fwd 0.94% 48.879us 32.76% 1.695ms 565.076us 3.518ms 100.00% 4.702ms 1.567ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.519ms 100.04% 3.519ms 3.519ms 1
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.518ms 100.00% 3.518ms 1.173ms 3
- Activity Buffer Request 27.57% 1.427ms 27.57% 1.427ms 1.427ms 1.184ms 33.66% 1.184ms 1.184ms 1
- cudaDeviceGetAttribute 0.07% 3.810us 0.07% 3.810us 0.254us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.14% 7.040us 0.48% 25.061us 8.354us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.35% 18.021us 0.35% 18.021us 6.007us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.40% 20.762us 0.40% 20.762us 2.307us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.07% 3.731us 0.07% 3.731us 1.244us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.21% 166.285us 3.21% 166.285us 55.428us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 65.13% 3.370ms 65.13% 3.370ms 3.370ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.175ms
- Self CUDA time total: 3.518ms

@@ -4046,35 +4046,35 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn 2.00% 105.404us 33.86% 1.781ms 1.781ms 0.000us 0.00% 4.846ms 4.846ms 1
- _flash_attn_9e27194::fwd 0.97% 50.822us 31.86% 1.675ms 558.446us 3.623ms 100.00% 4.846ms 1.615ms 3
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.624ms 100.04% 3.624ms 3.624ms 1
- void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.623ms 100.00% 3.623ms 1.208ms 3
- Activity Buffer Request 26.72% 1.405ms 26.72% 1.405ms 1.405ms 1.223ms 33.77% 1.223ms 1.223ms 1
- cudaDeviceGetAttribute 0.08% 4.369us 0.08% 4.369us 0.291us 0.000us 0.00% 0.000us 0.000us 15
- aten::empty_like 0.15% 7.679us 0.48% 25.141us 8.380us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty_strided 0.33% 17.462us 0.33% 17.462us 5.821us 0.000us 0.00% 0.000us 0.000us 3
- aten::empty 0.40% 21.081us 0.40% 21.081us 2.342us 0.000us 0.00% 0.000us 0.000us 9
- cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.13% 164.746us 3.13% 164.746us 54.915us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 66.14% 3.478ms 66.14% 3.478ms 3.478ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.259ms
- Self CUDA time total: 3.623ms

  impl wl p50(ms) ok
  hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
  </pre></div>
  <div class="cell-stderr">
  Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
- Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.12it/s]
- Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.15it/s]
  </div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
 
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 6.00s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 3.72% 162.003us 41.36% 1.801ms 1.801ms 0.000us 0.00% 3.718ms 3.718ms 1
+ _flash_attn_9e27194::fwd 1.69% 73.411us 37.64% 1.639ms 546.409us 2.775ms 100.00% 3.718ms 1.239ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.777ms 100.05% 2.777ms 2.777ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.775ms 100.00% 2.775ms 925.087us 3
+ Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 943.102us 33.98% 943.102us 943.102us 1
+ cudaDeviceGetAttribute 0.11% 4.701us 0.11% 4.701us 0.313us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.40% 17.630us 1.19% 51.921us 17.307us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.79% 34.291us 0.79% 34.291us 11.430us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.58% 25.250us 0.58% 25.250us 2.806us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.29% 12.441us 0.29% 12.441us 4.147us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.94% 40.982us 0.94% 40.982us 13.661us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 58.64% 2.554ms 58.64% 2.554ms 2.554ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.355ms
+ Self CUDA time total: 2.775ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.15% 96.184us 36.83% 1.645ms 1.645ms 0.000us 0.00% 3.965ms 3.965ms 1
+ _flash_attn_9e27194::fwd 1.07% 47.845us 34.67% 1.549ms 516.264us 2.974ms 100.00% 3.965ms 1.322ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.975ms 100.05% 2.975ms 2.975ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.974ms 100.00% 2.974ms 991.313us 3
+ Activity Buffer Request 31.80% 1.421ms 31.80% 1.421ms 1.421ms 990.779us 33.32% 990.779us 990.779us 1
+ cudaDeviceGetAttribute 0.08% 3.723us 0.08% 3.723us 0.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 6.890us 0.53% 23.451us 7.817us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.37% 16.561us 0.37% 16.561us 5.520us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.50% 22.171us 0.50% 22.171us 2.463us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.09% 3.911us 0.09% 3.911us 1.304us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.61% 27.040us 0.61% 27.040us 9.013us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.17% 2.822ms 63.17% 2.822ms 2.822ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.467ms
+ Self CUDA time total: 2.974ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.41% 109.001us 36.55% 1.652ms 1.652ms 0.000us 0.00% 4.036ms 4.036ms 1
+ _flash_attn_9e27194::fwd 1.11% 50.180us 34.14% 1.543ms 514.365us 3.018ms 100.00% 4.036ms 1.345ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.019ms 100.05% 3.019ms 3.019ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.018ms 100.00% 3.018ms 1.006ms 3
+ Activity Buffer Request 31.22% 1.411ms 31.22% 1.411ms 1.411ms 1.018ms 33.73% 1.018ms 1.018ms 1
+ cudaDeviceGetAttribute 0.08% 3.790us 0.08% 3.790us 0.253us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.151us 0.52% 23.401us 7.800us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.36% 16.250us 0.36% 16.250us 5.417us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.48% 21.660us 0.48% 21.660us 2.407us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.10% 4.380us 0.10% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.64% 28.812us 0.64% 28.812us 9.604us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 63.45% 2.868ms 63.45% 2.868ms 2.868ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.520ms
+ Self CUDA time total: 3.018ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.47% 118.264us 38.70% 1.854ms 1.854ms 0.000us 0.00% 4.130ms 4.130ms 1
+ _flash_attn_9e27194::fwd 1.01% 48.470us 36.23% 1.735ms 578.465us 3.094ms 100.00% 4.130ms 1.377ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.096ms 100.05% 3.096ms 3.096ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3
+ Activity Buffer Request 29.33% 1.405ms 29.33% 1.405ms 1.405ms 1.036ms 33.49% 1.036ms 1.036ms 1
+ cudaDeviceGetAttribute 0.08% 3.720us 0.08% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.16% 7.520us 0.53% 25.440us 8.480us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.37% 17.920us 0.37% 17.920us 5.973us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.43% 20.670us 0.43% 20.670us 2.297us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 4.010us 0.08% 4.010us 1.337us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 4.76% 227.935us 4.76% 227.935us 75.978us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.30% 2.937ms 61.30% 2.937ms 2.937ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.790ms
+ Self CUDA time total: 3.094ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.07% 110.462us 34.39% 1.835ms 1.835ms 0.000us 0.00% 4.876ms 4.876ms 1
+ _flash_attn_9e27194::fwd 0.91% 48.552us 32.32% 1.724ms 574.769us 3.652ms 100.00% 4.876ms 1.625ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.05% 3.654ms 3.654ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
+ Activity Buffer Request 27.00% 1.440ms 27.00% 1.440ms 1.440ms 1.224ms 33.53% 1.224ms 1.224ms 1
+ cudaDeviceGetAttribute 0.07% 3.831us 0.07% 3.831us 0.255us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.15% 7.880us 0.47% 24.970us 8.323us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.32% 17.090us 0.32% 17.090us 5.697us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.44% 23.410us 0.44% 23.410us 2.601us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.08% 4.110us 0.08% 4.110us 1.370us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.36% 179.284us 3.36% 179.284us 59.761us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 65.61% 3.500ms 65.61% 3.500ms 3.500ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.335ms
+ Self CUDA time total: 3.652ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn 2.06% 108.982us 33.74% 1.784ms 1.784ms 0.000us 0.00% 4.883ms 4.883ms 1
+ _flash_attn_9e27194::fwd 0.92% 48.842us 31.68% 1.675ms 558.369us 3.652ms 100.00% 4.883ms 1.628ms 3
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.04% 3.654ms 3.654ms 1
+ void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
+ Activity Buffer Request 26.57% 1.405ms 26.57% 1.405ms 1.405ms 1.231ms 33.70% 1.231ms 1.231ms 1
+ cudaDeviceGetAttribute 0.07% 3.720us 0.07% 3.720us 0.248us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty_like 0.14% 7.460us 0.45% 23.940us 7.980us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty_strided 0.31% 16.480us 0.31% 16.480us 5.493us 0.000us 0.00% 0.000us 0.000us 3
+ aten::empty 0.43% 22.601us 0.43% 22.601us 2.511us 0.000us 0.00% 0.000us 0.000us 9
+ cudaFuncSetAttribute 0.07% 3.610us 0.07% 3.610us 1.203us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.17% 167.603us 3.17% 167.603us 55.868us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 66.26% 3.504ms 66.26% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.288ms
+ Self CUDA time total: 3.652ms

  impl wl p50(ms) ok
  hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
  </pre></div>
  <div class="cell-stderr">
  Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+ Fetching 20 files: 10%|█ | 2/20 [00:01<00:14, 1.21it/s]
+ Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.08it/s]
  </div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
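
Note: the p50(ms) column in these summaries is the median of the raw per-repetition latencies. A minimal sketch of that reduction, assuming a plain list of millisecond timings per workload:

    import statistics

    def p50_ms(raw_times_ms):
        # Median latency across repetitions, as printed in the p50(ms) column.
        return statistics.median(raw_times_ms)

    # Illustrative values only, not taken from the runs above.
    print(f"hf_kernels_flash_attn cuda_attn_L128_bfloat16 {p50_ms([0.96, 0.95, 0.94, 0.95, 0.97]):.2f}")
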
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: benchmark | 6.50s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 3.80% 163.585us 44.55% 1.916ms 1.916ms 0.000us 0.00% 3.598ms 3.598ms 1
- FlashAttnFunc 3.38% 145.315us 40.75% 1.753ms 584.213us 0.000us 0.00% 3.598ms 1.199ms 3
- _flash_attn3_48fe103_dirty::fwd 1.86% 80.133us 37.37% 1.607ms 535.775us 2.702ms 100.00% 3.598ms 1.199ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.704ms 100.05% 2.704ms 2.704ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.702ms 100.00% 2.702ms 900.800us 3
- Activity Buffer Request 33.08% 1.423ms 33.08% 1.423ms 1.423ms 895.776us 33.15% 895.776us 895.776us 1
- aten::empty 1.02% 43.812us 1.02% 43.812us 7.302us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.30% 13.081us 0.30% 13.081us 4.360us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 1.10% 47.211us 1.10% 47.211us 15.737us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 55.45% 2.385ms 55.45% 2.385ms 2.385ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.301ms
- Self CUDA time total: 2.702ms

@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.35% 101.013us 40.06% 1.725ms 1.725ms 0.000us 0.00% 3.751ms 3.751ms 1
- FlashAttnFunc 2.16% 92.983us 37.71% 1.624ms 541.352us 0.000us 0.00% 3.751ms 1.250ms 3
- _flash_attn3_48fe103_dirty::fwd 1.19% 51.175us 35.55% 1.531ms 510.358us 2.802ms 100.00% 3.751ms 1.250ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.803ms 100.06% 2.803ms 2.803ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.802ms 100.00% 2.802ms 933.921us 3
- Activity Buffer Request 32.90% 1.417ms 32.90% 1.417ms 1.417ms 949.686us 33.90% 949.686us 949.686us 1
- aten::empty 0.63% 27.091us 0.63% 27.091us 4.515us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.239us 0.12% 5.239us 1.746us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.72% 30.870us 0.72% 30.870us 10.290us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 59.94% 2.581ms 59.94% 2.581ms 2.581ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.306ms
- Self CUDA time total: 2.802ms

@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.33% 100.994us 40.09% 1.739ms 1.739ms 0.000us 0.00% 3.778ms 3.778ms 1
- FlashAttnFunc 2.19% 94.944us 37.76% 1.638ms 545.852us 0.000us 0.00% 3.778ms 1.259ms 3
- _flash_attn3_48fe103_dirty::fwd 1.20% 52.112us 35.57% 1.543ms 514.204us 2.819ms 100.00% 3.778ms 1.259ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 100.05% 2.820ms 2.820ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.819ms 100.00% 2.819ms 939.550us 3
- Activity Buffer Request 32.79% 1.422ms 32.79% 1.422ms 1.422ms 959.198us 34.03% 959.198us 959.198us 1
- aten::empty 0.60% 26.051us 0.60% 26.051us 4.342us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.409us 0.12% 5.409us 1.803us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.85% 36.931us 0.85% 36.931us 12.310us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 59.91% 2.599ms 59.91% 2.599ms 2.599ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.337ms
- Self CUDA time total: 2.819ms

@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.88% 135.094us 43.08% 2.020ms 2.020ms 0.000us 0.00% 3.874ms 3.874ms 1
- FlashAttnFunc 2.10% 98.504us 40.20% 1.885ms 628.185us 0.000us 0.00% 3.874ms 1.291ms 3
- _flash_attn3_48fe103_dirty::fwd 1.10% 51.632us 38.10% 1.786ms 595.350us 2.895ms 100.00% 3.874ms 1.291ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.897ms 100.06% 2.897ms 2.897ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.895ms 100.00% 2.895ms 965.011us 3
- Activity Buffer Request 30.58% 1.434ms 30.58% 1.434ms 1.434ms 979.229us 33.82% 979.229us 979.229us 1
- aten::empty 0.58% 27.080us 0.58% 27.080us 4.513us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 5.72% 268.289us 5.72% 268.289us 89.430us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 56.92% 2.668ms 56.92% 2.668ms 2.668ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.688ms
- Self CUDA time total: 2.895ms

@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.52% 128.963us 37.26% 1.903ms 1.903ms 0.000us 0.00% 4.575ms 4.575ms 1
- FlashAttnFunc 1.87% 95.425us 34.74% 1.774ms 591.441us 0.000us 0.00% 4.575ms 1.525ms 3
- _flash_attn3_48fe103_dirty::fwd 1.01% 51.593us 32.87% 1.679ms 559.632us 3.427ms 100.00% 4.575ms 1.525ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.429ms 100.05% 3.429ms 3.429ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.427ms 100.00% 3.427ms 1.142ms 3
- Activity Buffer Request 27.82% 1.421ms 27.82% 1.421ms 1.421ms 1.148ms 33.49% 1.148ms 1.148ms 1
- aten::empty 0.55% 28.251us 0.55% 28.251us 4.709us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.10% 5.249us 0.10% 5.249us 1.750us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.38% 172.866us 3.38% 172.866us 57.622us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 62.74% 3.205ms 62.74% 3.205ms 3.205ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.108ms
- Self CUDA time total: 3.427ms

@@ -4035,40 +4035,35 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- hf_kernels_flash_attn3 2.37% 119.165us 36.69% 1.842ms 1.842ms 0.000us 0.00% 4.545ms 4.545ms 1
- FlashAttnFunc 1.86% 93.463us 34.32% 1.723ms 574.423us 0.000us 0.00% 4.545ms 1.515ms 3
- _flash_attn3_48fe103_dirty::fwd 1.01% 50.561us 32.46% 1.630ms 543.268us 3.398ms 100.00% 4.545ms 1.515ms 3
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
- Activity Buffer Request 27.47% 1.379ms 27.47% 1.379ms 1.379ms 1.147ms 33.76% 1.147ms 1.147ms 1
- aten::empty 0.56% 28.202us 0.56% 28.202us 4.700us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.10% 5.090us 0.10% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.32% 166.515us 3.32% 166.515us 55.505us 0.000us 0.00% 0.000us 0.000us 3
- cudaDeviceSynchronize 63.31% 3.179ms 63.31% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.022ms
- Self CUDA time total: 3.398ms

  impl wl p50(ms) ok
  hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
  </pre></div>
- <div class="uv-install-logs" id="uv-logs-benchmark">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Downloading hf-xet (3.2MiB)
- Downloading hf-xet
- Installed 15 packages in 15ms
  </div>
- </div>
- <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
- Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.06it/s]
- Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.12it/s]</div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 5.51s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 3.90% 167.264us 45.75% 1.964ms 1.964ms 0.000us 0.00% 3.551ms 3.551ms 1
+ FlashAttnFunc 3.34% 143.492us 41.85% 1.797ms 598.836us 0.000us 0.00% 3.551ms 1.184ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.86% 80.044us 38.51% 1.653ms 551.005us 2.654ms 100.00% 3.551ms 1.184ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.655ms 100.05% 2.655ms 2.655ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.654ms 100.00% 2.654ms 884.532us 3
+ Activity Buffer Request 34.19% 1.468ms 34.19% 1.468ms 1.468ms 897.822us 33.83% 897.822us 897.822us 1
+ aten::empty 1.09% 46.590us 1.09% 46.590us 7.765us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.30% 12.680us 0.30% 12.680us 4.227us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 1.07% 45.911us 1.07% 45.911us 15.304us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 54.25% 2.329ms 54.25% 2.329ms 2.329ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.293ms
+ Self CUDA time total: 2.654ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.52% 108.973us 40.29% 1.745ms 1.745ms 0.000us 0.00% 3.761ms 3.761ms 1
+ FlashAttnFunc 2.11% 91.250us 37.77% 1.636ms 545.408us 0.000us 0.00% 3.761ms 1.254ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.23% 53.414us 35.67% 1.545ms 514.991us 2.811ms 100.00% 3.761ms 1.254ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.813ms 100.05% 2.813ms 2.813ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.811ms 100.00% 2.811ms 937.084us 3
+ Activity Buffer Request 32.99% 1.429ms 32.99% 1.429ms 1.429ms 949.852us 33.79% 949.852us 949.852us 1
+ aten::empty 0.64% 27.630us 0.64% 27.630us 4.605us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 4.980us 0.11% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.69% 30.020us 0.69% 30.020us 10.007us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.71% 2.587ms 59.71% 2.587ms 2.587ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.332ms
+ Self CUDA time total: 2.811ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.50% 112.343us 38.92% 1.748ms 1.748ms 0.000us 0.00% 3.960ms 3.960ms 1
+ FlashAttnFunc 2.05% 91.871us 36.42% 1.636ms 545.325us 0.000us 0.00% 3.960ms 1.320ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.14% 51.221us 34.37% 1.544ms 514.701us 2.972ms 100.00% 3.960ms 1.320ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.973ms 100.05% 2.973ms 2.973ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.972ms 100.00% 2.972ms 990.630us 3
+ Activity Buffer Request 31.81% 1.429ms 31.81% 1.429ms 1.429ms 987.835us 33.24% 987.835us 987.835us 1
+ aten::empty 0.63% 28.400us 0.63% 28.400us 4.733us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 5.211us 0.12% 5.211us 1.737us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.67% 30.301us 0.67% 30.301us 10.100us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 61.08% 2.744ms 61.08% 2.744ms 2.744ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.492ms
+ Self CUDA time total: 2.972ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ hf_kernels_flash_attn3 2.16% 102.333us 40.97% 1.945ms 1.945ms 0.000us 0.00% 4.045ms 4.045ms 1
+ FlashAttnFunc 1.95% 92.400us 38.81% 1.843ms 614.206us 0.000us 0.00% 4.045ms 1.348ms 3
+ _flash_attn3_48fe103_dirty::fwd 1.07% 50.872us 36.87% 1.750ms 583.406us 3.024ms 100.00% 4.045ms 1.348ms 3
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.026ms 100.05% 3.026ms 3.026ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.024ms 100.00% 3.024ms 1.008ms 3
+ Activity Buffer Request 29.88% 1.419ms 29.88% 1.419ms 1.419ms 1.021ms 33.76% 1.021ms 1.021ms 1
+ aten::empty 0.61% 28.961us 0.61% 28.961us 4.827us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.19% 246.415us 5.19% 246.415us 82.138us 0.000us 0.00% 0.000us 0.000us 3
+ cudaDeviceSynchronize 59.03% 2.803ms 59.03% 2.803ms 2.803ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.747ms
+ Self CUDA time total: 3.024ms

4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
+ hf_kernels_flash_attn3 2.48% 128.541us 37.35% 1.936ms 1.936ms 0.000us 0.00% 4.636ms 4.636ms 1
4017
+ FlashAttnFunc 1.81% 93.984us 34.87% 1.807ms 602.493us 0.000us 0.00% 4.636ms 1.545ms 3
4018
+ _flash_attn3_48fe103_dirty::fwd 0.96% 49.852us 33.05% 1.713ms 571.165us 3.473ms 100.00% 4.636ms 1.545ms 3
4019
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.475ms 100.05% 3.475ms 3.475ms 1
4020
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.473ms 100.00% 3.473ms 1.158ms 3
4021
+ Activity Buffer Request 27.80% 1.441ms 27.80% 1.441ms 1.441ms 1.163ms 33.49% 1.163ms 1.163ms 1
4022
+ aten::empty 0.57% 29.640us 0.57% 29.640us 4.940us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaFuncSetAttribute 0.10% 5.160us 0.10% 5.160us 1.720us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 3.62% 187.873us 3.62% 187.873us 62.624us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 62.65% 3.248ms 62.65% 3.248ms 3.248ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 5.184ms
4028
+ Self CUDA time total: 3.473ms
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_flash_attn3 2.41% 121.192us 36.39% 1.829ms 1.829ms 0.000us 0.00% 4.566ms 4.566ms 1
4039
+ FlashAttnFunc 1.84% 92.271us 33.97% 1.707ms 569.139us 0.000us 0.00% 4.566ms 1.522ms 3
4040
+ _flash_attn3_48fe103_dirty::fwd 1.00% 50.242us 32.14% 1.615ms 538.382us 3.416ms 100.00% 4.566ms 1.522ms 3
4041
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.417ms 100.04% 3.417ms 3.417ms 1
4042
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.416ms 100.00% 3.416ms 1.139ms 3
4043
+ Activity Buffer Request 27.08% 1.361ms 27.08% 1.361ms 1.361ms 1.150ms 33.68% 1.150ms 1.150ms 1
4044
+ aten::empty 0.60% 30.030us 0.60% 30.030us 5.005us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaFuncSetAttribute 0.10% 5.061us 0.10% 5.061us 1.687us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaLaunchKernel 3.36% 168.913us 3.36% 168.913us 56.304us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 63.61% 3.197ms 63.61% 3.197ms 3.197ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 5.026ms
4050
+ Self CUDA time total: 3.416ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
  hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
4055
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4056
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4057
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4058
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4059
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
+ <div class="cell-stderr">
4062
+ Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 9.18it/s]
4064
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.23it/s]
4065
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.82it/s]
 
4066
  </div>
4067
  <div class="cell-artifacts">
4068
  <h4>Artifacts:</h4>
4069
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
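
Note: the `impl  wl  p50(ms)  ok` summary above is printed from the attention.jsonl artifact linked here. A minimal sketch of recomputing that table offline — the per-record keys (`impl`, `wl.name`, `lat_ms.p50`, `ok`) are assumptions about the JSONL layout, not taken from the benchmark cell:

    import json

    # Assumed record layout; inspect one line of attention.jsonl to confirm.
    with open("artifacts/benchmark/attention.jsonl") as f:
        for line in f:
            rec = json.loads(line)
            p50 = rec["lat_ms"]["p50"]  # median latency in ms (assumed key)
            print(f"{rec['impl']:<24} {rec['wl']['name']:<26} {p50:>6.2f} {rec['ok']}")
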
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 35.14s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
- torch_mem_eff 4.95% 352.351us 32.76% 2.334ms 2.334ms 0.000us 0.00% 5.540ms 5.540ms 1
3928
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.61% 5.523ms 5.523ms 1
3929
- aten::scaled_dot_product_attention 0.42% 30.002us 2.65% 188.407us 62.802us 0.000us 0.00% 4.866ms 1.622ms 3
3930
- aten::_scaled_dot_product_efficient_attention 0.34% 24.112us 2.22% 158.405us 52.802us 0.000us 0.00% 4.866ms 1.622ms 3
3931
- aten::_efficient_attention_forward 0.50% 35.512us 1.50% 106.553us 35.518us 4.866ms 88.65% 4.866ms 1.622ms 3
3932
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.866ms 88.65% 4.866ms 1.622ms 3
3933
- aten::contiguous 0.17% 12.230us 24.19% 1.723ms 191.466us 0.000us 0.00% 673.885us 74.876us 9
3934
- aten::clone 0.48% 34.032us 24.02% 1.711ms 190.107us 0.000us 0.00% 673.885us 74.876us 9
3935
- aten::copy_ 1.04% 73.980us 22.51% 1.603ms 178.136us 623.037us 11.35% 673.885us 74.876us 9
3936
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 623.037us 11.35% 623.037us 69.226us 9
3937
- Activity Buffer Request 20.23% 1.441ms 20.23% 1.441ms 1.441ms 50.848us 0.93% 50.848us 50.848us 1
3938
- aten::transpose 1.03% 73.058us 1.37% 97.392us 4.058us 0.000us 0.00% 0.000us 0.000us 24
3939
- aten::as_strided 0.34% 24.334us 0.34% 24.334us 1.014us 0.000us 0.00% 0.000us 0.000us 24
3940
- aten::empty_like 0.28% 19.590us 1.03% 73.701us 8.189us 0.000us 0.00% 0.000us 0.000us 9
3941
- aten::empty 1.26% 89.621us 1.26% 89.621us 4.268us 0.000us 0.00% 0.000us 0.000us 21
3942
- cudaLaunchKernel 1.58% 112.598us 1.58% 112.598us 9.383us 0.000us 0.00% 0.000us 0.000us 12
3943
- cudaStreamIsCapturing 0.04% 3.160us 0.04% 3.160us 1.053us 0.000us 0.00% 0.000us 0.000us 3
3944
- cudaFuncSetAttribute 0.12% 8.400us 0.12% 8.400us 2.800us 0.000us 0.00% 0.000us 0.000us 3
3945
- cudaDeviceSynchronize 67.24% 4.789ms 67.24% 4.789ms 4.789ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
- Self CPU time total: 7.123ms
3948
- Self CUDA time total: 5.489ms
3949
 
3950
 
3951
 
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
- torch_mem_eff 3.15% 231.099us 27.84% 2.044ms 2.044ms 0.000us 0.00% 5.902ms 5.902ms 1
3959
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.856ms 100.14% 5.856ms 5.856ms 1
3960
- aten::scaled_dot_product_attention 0.26% 19.041us 1.91% 140.484us 46.828us 0.000us 0.00% 5.210ms 1.737ms 3
3961
- aten::_scaled_dot_product_efficient_attention 0.25% 18.340us 1.65% 121.443us 40.481us 0.000us 0.00% 5.210ms 1.737ms 3
3962
- aten::_efficient_attention_forward 0.40% 29.263us 1.10% 80.783us 26.928us 5.210ms 89.09% 5.210ms 1.737ms 3
3963
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.210ms 89.09% 5.210ms 1.737ms 3
3964
- aten::contiguous 0.10% 7.239us 22.19% 1.629ms 181.023us 0.000us 0.00% 692.607us 76.956us 9
3965
- aten::clone 0.29% 21.632us 22.09% 1.622ms 180.219us 0.000us 0.00% 692.607us 76.956us 9
3966
- aten::copy_ 0.87% 63.554us 21.13% 1.551ms 172.359us 638.271us 10.91% 692.607us 76.956us 9
3967
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 638.271us 10.91% 638.271us 70.919us 9
3968
- Activity Buffer Request 19.39% 1.423ms 19.39% 1.423ms 1.423ms 54.336us 0.93% 54.336us 54.336us 1
3969
- aten::transpose 0.66% 48.509us 0.89% 65.581us 2.733us 0.000us 0.00% 0.000us 0.000us 24
3970
- aten::as_strided 0.23% 17.072us 0.23% 17.072us 0.711us 0.000us 0.00% 0.000us 0.000us 24
3971
- aten::empty_like 0.16% 11.700us 0.67% 49.102us 5.456us 0.000us 0.00% 0.000us 0.000us 9
3972
- aten::empty 0.83% 61.232us 0.83% 61.232us 2.916us 0.000us 0.00% 0.000us 0.000us 21
3973
- cudaLaunchKernel 1.18% 86.372us 1.18% 86.372us 7.198us 0.000us 0.00% 0.000us 0.000us 12
3974
- cudaStreamIsCapturing 0.03% 2.340us 0.03% 2.340us 0.780us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaFuncSetAttribute 0.05% 3.500us 0.05% 3.500us 1.167us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 72.16% 5.297ms 72.16% 5.297ms 5.297ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 7.341ms
3979
- Self CUDA time total: 5.848ms
3980
 
3981
 
3982
 
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- torch_mem_eff 2.94% 229.483us 29.69% 2.318ms 2.318ms 0.000us 0.00% 6.099ms 6.099ms 1
3990
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.049ms 100.14% 6.049ms 6.049ms 1
3991
- aten::scaled_dot_product_attention 0.23% 17.971us 1.79% 139.464us 46.488us 0.000us 0.00% 5.384ms 1.795ms 3
3992
- aten::_scaled_dot_product_efficient_attention 0.23% 18.090us 1.56% 121.493us 40.498us 0.000us 0.00% 5.384ms 1.795ms 3
3993
- aten::_efficient_attention_forward 0.36% 27.830us 1.04% 80.963us 26.988us 5.384ms 89.13% 5.384ms 1.795ms 3
3994
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.384ms 89.13% 5.384ms 1.795ms 3
3995
- aten::contiguous 0.09% 7.278us 24.41% 1.906ms 211.734us 0.000us 0.00% 714.652us 79.406us 9
3996
- aten::clone 0.28% 21.781us 24.31% 1.898ms 210.925us 0.000us 0.00% 714.652us 79.406us 9
3997
- aten::copy_ 0.80% 62.662us 23.36% 1.824ms 202.683us 656.540us 10.87% 714.652us 79.406us 9
3998
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.540us 10.87% 656.540us 72.949us 9
3999
- Activity Buffer Request 21.74% 1.697ms 21.74% 1.697ms 1.697ms 58.112us 0.96% 58.112us 58.112us 1
4000
- aten::transpose 0.63% 48.810us 0.84% 65.850us 2.744us 0.000us 0.00% 0.000us 0.000us 24
4001
- aten::as_strided 0.22% 17.040us 0.22% 17.040us 0.710us 0.000us 0.00% 0.000us 0.000us 24
4002
- aten::empty_like 0.14% 11.161us 0.67% 52.392us 5.821us 0.000us 0.00% 0.000us 0.000us 9
4003
- aten::empty 0.87% 67.583us 0.87% 67.583us 3.218us 0.000us 0.00% 0.000us 0.000us 21
4004
- cudaLaunchKernel 1.09% 85.261us 1.09% 85.261us 7.105us 0.000us 0.00% 0.000us 0.000us 12
4005
- cudaStreamIsCapturing 0.03% 2.451us 0.03% 2.451us 0.817us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaFuncSetAttribute 0.04% 3.290us 0.04% 3.290us 1.097us 0.000us 0.00% 0.000us 0.000us 3
4007
- cudaDeviceSynchronize 70.31% 5.490ms 70.31% 5.490ms 5.490ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
- Self CPU time total: 7.808ms
4010
- Self CUDA time total: 6.041ms
4011
 
4012
 
4013
 
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
- torch_mem_eff 2.96% 232.645us 28.95% 2.277ms 2.277ms 0.000us 0.00% 6.207ms 6.207ms 1
4021
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.157ms 100.14% 6.157ms 6.157ms 1
4022
- aten::scaled_dot_product_attention 0.23% 18.052us 1.76% 138.596us 46.199us 0.000us 0.00% 5.492ms 1.831ms 3
4023
- aten::_scaled_dot_product_efficient_attention 0.23% 17.731us 1.53% 120.544us 40.181us 0.000us 0.00% 5.492ms 1.831ms 3
4024
- aten::_efficient_attention_forward 0.35% 27.329us 1.02% 80.113us 26.704us 5.492ms 89.32% 5.492ms 1.831ms 3
4025
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.492ms 89.32% 5.492ms 1.831ms 3
4026
- aten::contiguous 0.09% 7.269us 23.67% 1.862ms 206.848us 0.000us 0.00% 714.624us 79.403us 9
4027
- aten::clone 0.28% 21.997us 23.58% 1.854ms 206.041us 0.000us 0.00% 714.624us 79.403us 9
4028
- aten::copy_ 0.89% 69.616us 22.61% 1.779ms 197.614us 656.513us 10.68% 714.624us 79.403us 9
4029
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.513us 10.68% 656.513us 72.946us 9
4030
- Activity Buffer Request 17.99% 1.415ms 17.99% 1.415ms 1.415ms 58.111us 0.95% 58.111us 58.111us 1
4031
- aten::transpose 0.63% 49.422us 0.84% 66.332us 2.764us 0.000us 0.00% 0.000us 0.000us 24
4032
- aten::as_strided 0.22% 16.910us 0.22% 16.910us 0.705us 0.000us 0.00% 0.000us 0.000us 24
4033
- aten::empty_like 0.15% 11.593us 0.68% 53.843us 5.983us 0.000us 0.00% 0.000us 0.000us 9
4034
- aten::empty 0.87% 68.381us 0.87% 68.381us 3.256us 0.000us 0.00% 0.000us 0.000us 21
4035
- cudaLaunchKernel 4.00% 314.941us 4.00% 314.941us 26.245us 0.000us 0.00% 0.000us 0.000us 12
4036
- cudaStreamIsCapturing 0.03% 2.380us 0.03% 2.380us 0.793us 0.000us 0.00% 0.000us 0.000us 3
4037
- cudaFuncSetAttribute 0.04% 3.242us 0.04% 3.242us 1.081us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 71.05% 5.588ms 71.05% 5.588ms 5.588ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 7.865ms
4041
- Self CUDA time total: 6.149ms
4042
 
4043
 
4044
 
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- torch_mem_eff 2.91% 232.917us 28.19% 2.257ms 2.257ms 0.000us 0.00% 6.364ms 6.364ms 1
4052
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.313ms 100.13% 6.313ms 6.313ms 1
4053
- aten::scaled_dot_product_attention 0.22% 17.912us 1.77% 142.075us 47.358us 0.000us 0.00% 5.641ms 1.880ms 3
4054
- aten::_scaled_dot_product_efficient_attention 0.23% 18.730us 1.55% 124.163us 41.388us 0.000us 0.00% 5.641ms 1.880ms 3
4055
- aten::_efficient_attention_forward 0.36% 29.090us 1.02% 81.873us 27.291us 5.641ms 89.47% 5.641ms 1.880ms 3
4056
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.641ms 89.47% 5.641ms 1.880ms 3
4057
- aten::contiguous 0.09% 7.221us 22.98% 1.840ms 204.428us 0.000us 0.00% 723.455us 80.384us 9
4058
- aten::clone 0.27% 21.690us 22.89% 1.833ms 203.626us 0.000us 0.00% 723.455us 80.384us 9
4059
- aten::copy_ 0.78% 62.812us 21.99% 1.761ms 195.631us 663.839us 10.53% 723.455us 80.384us 9
4060
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.839us 10.53% 663.839us 73.760us 9
4061
- Activity Buffer Request 18.37% 1.471ms 18.37% 1.471ms 1.471ms 59.616us 0.95% 59.616us 59.616us 1
4062
- aten::transpose 0.60% 48.283us 0.82% 65.922us 2.747us 0.000us 0.00% 0.000us 0.000us 24
4063
- aten::as_strided 0.22% 17.639us 0.22% 17.639us 0.735us 0.000us 0.00% 0.000us 0.000us 24
4064
- aten::empty_like 0.15% 11.816us 0.63% 50.264us 5.585us 0.000us 0.00% 0.000us 0.000us 9
4065
- aten::empty 0.80% 63.840us 0.80% 63.840us 3.040us 0.000us 0.00% 0.000us 0.000us 21
4066
- cudaLaunchKernel 3.11% 249.257us 3.11% 249.257us 20.771us 0.000us 0.00% 0.000us 0.000us 12
4067
- cudaStreamIsCapturing 0.03% 2.260us 0.03% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3
4068
- cudaFuncSetAttribute 0.04% 3.100us 0.04% 3.100us 1.033us 0.000us 0.00% 0.000us 0.000us 3
4069
- cudaDeviceSynchronize 71.81% 5.750ms 71.81% 5.750ms 5.750ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 8.007ms
4072
- Self CUDA time total: 6.304ms
4073
 
4074
 
4075
 
@@ -4079,90 +4079,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_mem_eff 3.10% 262.407us 28.45% 2.407ms 2.407ms 0.000us 0.00% 6.700ms 6.700ms 1
4083
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.648ms 100.13% 6.648ms 6.648ms 1
4084
- aten::scaled_dot_product_attention 0.22% 18.361us 1.72% 145.216us 48.405us 0.000us 0.00% 5.968ms 1.989ms 3
4085
- aten::_scaled_dot_product_efficient_attention 0.22% 18.717us 1.50% 126.855us 42.285us 0.000us 0.00% 5.968ms 1.989ms 3
4086
- aten::_efficient_attention_forward 0.34% 29.081us 1.00% 84.393us 28.131us 5.968ms 89.89% 5.968ms 1.989ms 3
4087
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.968ms 89.89% 5.968ms 1.989ms 3
4088
- aten::contiguous 0.09% 7.641us 23.04% 1.949ms 216.566us 0.000us 0.00% 731.964us 81.329us 9
4089
- aten::clone 0.29% 24.377us 22.95% 1.941ms 215.717us 0.000us 0.00% 731.964us 81.329us 9
4090
- aten::copy_ 0.80% 68.015us 22.01% 1.862ms 206.906us 670.941us 10.11% 731.964us 81.329us 9
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 670.941us 10.11% 670.941us 74.549us 9
4092
- Activity Buffer Request 17.04% 1.441ms 17.04% 1.441ms 1.441ms 61.023us 0.92% 61.023us 61.023us 1
4093
- aten::transpose 0.67% 56.417us 0.87% 73.607us 3.067us 0.000us 0.00% 0.000us 0.000us 24
4094
- aten::as_strided 0.20% 17.190us 0.20% 17.190us 0.716us 0.000us 0.00% 0.000us 0.000us 24
4095
- aten::empty_like 0.14% 12.051us 0.65% 54.922us 6.102us 0.000us 0.00% 0.000us 0.000us 9
4096
- aten::empty 0.83% 69.821us 0.83% 69.821us 3.325us 0.000us 0.00% 0.000us 0.000us 21
4097
- cudaLaunchKernel 4.44% 375.855us 4.44% 375.855us 31.321us 0.000us 0.00% 0.000us 0.000us 12
4098
- cudaStreamIsCapturing 0.03% 2.230us 0.03% 2.230us 0.743us 0.000us 0.00% 0.000us 0.000us 3
4099
- cudaFuncSetAttribute 0.04% 3.250us 0.04% 3.250us 1.083us 0.000us 0.00% 0.000us 0.000us 3
4100
- cudaDeviceSynchronize 71.55% 6.053ms 71.55% 6.053ms 6.053ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
- Self CPU time total: 8.459ms
4103
- Self CUDA time total: 6.639ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
- torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4108
- torch_mem_eff cuda_attn_L256_bfloat16 1.99 True
4109
- torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
4110
  torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4111
- torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
4112
- torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
4113
  </pre></div>
4114
- <div class="uv-install-logs" id="uv-logs-benchmark">
4115
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4116
- <div class="uv-logs-content" style="display: none;">
4117
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4118
- Downloading setuptools (1.1MiB)
4119
- Downloading nvidia-curand-cu12 (60.7MiB)
4120
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4121
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4122
- Downloading nvidia-nccl-cu12 (307.4MiB)
4123
- Downloading matplotlib (8.3MiB)
4124
- Downloading nvidia-cufile-cu12 (1.1MiB)
4125
- Downloading nvidia-cufft-cu12 (184.2MiB)
4126
- Downloading pillow (6.7MiB)
4127
- Downloading fonttools (4.7MiB)
4128
- Downloading numpy (16.2MiB)
4129
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4130
- Downloading torch (846.9MiB)
4131
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4132
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4133
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4134
- Downloading kiwisolver (1.4MiB)
4135
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4136
- Downloading sympy (6.0MiB)
4137
- Downloading triton (148.3MiB)
4138
- Downloading networkx (1.9MiB)
4139
- Downloading nvidia-cublas-cu12 (566.8MiB)
4140
- Downloading nvidia-cufile-cu12
4141
- Downloading kiwisolver
4142
- Downloading setuptools
4143
- Downloading fonttools
4144
- Downloading networkx
4145
- Downloading pillow
4146
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4147
- Downloading nvidia-cuda-cupti-cu12
4148
- Downloading matplotlib
4149
- Downloading numpy
4150
- Downloading sympy
4151
- Downloading nvidia-nvjitlink-cu12
4152
- Downloading nvidia-curand-cu12
4153
- Downloading nvidia-cuda-nvrtc-cu12
4154
- Downloading triton
4155
- Downloading nvidia-cufft-cu12
4156
- Downloading nvidia-cusolver-cu12
4157
- Downloading nvidia-cusparse-cu12
4158
- Downloading nvidia-cusparselt-cu12
4159
- Downloading nvidia-nccl-cu12
4160
- Downloading nvidia-cublas-cu12
4161
- Downloading nvidia-cudnn-cu12
4162
- Downloading torch
4163
- Installed 37 packages in 223ms
4164
- </div>
4165
- </div>
4166
  <div class="cell-artifacts">
4167
  <h4>Artifacts:</h4>
4168
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 3.92s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
+ torch_mem_eff 4.88% 347.876us 33.28% 2.372ms 2.372ms 0.000us 0.00% 5.473ms 5.473ms 1
3928
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.465ms 100.77% 5.465ms 5.465ms 1
3929
+ aten::scaled_dot_product_attention 0.44% 31.501us 2.47% 176.074us 58.691us 0.000us 0.00% 4.806ms 1.602ms 3
3930
+ aten::_scaled_dot_product_efficient_attention 0.33% 23.351us 2.03% 144.573us 48.191us 0.000us 0.00% 4.806ms 1.602ms 3
3931
+ aten::_efficient_attention_forward 0.48% 33.995us 1.40% 99.622us 33.207us 4.806ms 88.63% 4.806ms 1.602ms 3
3932
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.806ms 88.63% 4.806ms 1.602ms 3
3933
+ aten::contiguous 0.20% 13.962us 24.98% 1.780ms 197.762us 0.000us 0.00% 667.264us 74.140us 9
3934
+ aten::clone 0.48% 34.432us 24.78% 1.766ms 196.211us 0.000us 0.00% 667.264us 74.140us 9
3935
+ aten::copy_ 1.03% 73.682us 23.27% 1.658ms 184.268us 616.768us 11.37% 667.264us 74.140us 9
3936
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 616.768us 11.37% 616.768us 68.530us 9
3937
+ Activity Buffer Request 21.06% 1.501ms 21.06% 1.501ms 1.501ms 50.496us 0.93% 50.496us 50.496us 1
3938
+ aten::transpose 0.94% 67.099us 1.26% 89.541us 3.731us 0.000us 0.00% 0.000us 0.000us 24
3939
+ aten::as_strided 0.31% 22.442us 0.31% 22.442us 0.935us 0.000us 0.00% 0.000us 0.000us 24
3940
+ aten::empty_like 0.26% 18.431us 1.03% 73.051us 8.117us 0.000us 0.00% 0.000us 0.000us 9
3941
+ aten::empty 1.15% 82.238us 1.15% 82.238us 3.916us 0.000us 0.00% 0.000us 0.000us 21
3942
+ cudaLaunchKernel 1.53% 109.170us 1.53% 109.170us 9.098us 0.000us 0.00% 0.000us 0.000us 12
3943
+ cudaStreamIsCapturing 0.04% 3.169us 0.04% 3.169us 1.056us 0.000us 0.00% 0.000us 0.000us 3
3944
+ cudaFuncSetAttribute 0.13% 9.530us 0.13% 9.530us 3.177us 0.000us 0.00% 0.000us 0.000us 3
3945
+ cudaDeviceSynchronize 66.72% 4.754ms 66.72% 4.754ms 4.754ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
+ Self CPU time total: 7.126ms
3948
+ Self CUDA time total: 5.423ms
3949
 
3950
 
3951
 
 
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
+ torch_mem_eff 3.49% 251.026us 29.53% 2.123ms 2.123ms 0.000us 0.00% 5.671ms 5.671ms 1
3959
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.625ms 100.14% 5.625ms 5.625ms 1
3960
+ aten::scaled_dot_product_attention 0.28% 19.941us 1.97% 141.843us 47.281us 0.000us 0.00% 4.980ms 1.660ms 3
3961
+ aten::_scaled_dot_product_efficient_attention 0.25% 17.669us 1.70% 121.902us 40.634us 0.000us 0.00% 4.980ms 1.660ms 3
3962
+ aten::_efficient_attention_forward 0.38% 27.651us 1.14% 82.182us 27.394us 4.980ms 88.66% 4.980ms 1.660ms 3
3963
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.980ms 88.66% 4.980ms 1.660ms 3
3964
+ aten::contiguous 0.10% 7.480us 23.48% 1.688ms 187.567us 0.000us 0.00% 691.071us 76.786us 9
3965
+ aten::clone 0.30% 21.261us 23.38% 1.681ms 186.736us 0.000us 0.00% 691.071us 76.786us 9
3966
+ aten::copy_ 0.85% 60.983us 22.39% 1.610ms 178.842us 637.247us 11.34% 691.071us 76.786us 9
3967
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 637.247us 11.34% 637.247us 70.805us 9
3968
+ Activity Buffer Request 20.63% 1.483ms 20.63% 1.483ms 1.483ms 53.824us 0.96% 53.824us 53.824us 1
3969
+ aten::transpose 0.67% 48.164us 0.89% 64.122us 2.672us 0.000us 0.00% 0.000us 0.000us 24
3970
+ aten::as_strided 0.22% 15.958us 0.22% 15.958us 0.665us 0.000us 0.00% 0.000us 0.000us 24
3971
+ aten::empty_like 0.16% 11.580us 0.69% 49.790us 5.532us 0.000us 0.00% 0.000us 0.000us 9
3972
+ aten::empty 0.89% 63.701us 0.89% 63.701us 3.033us 0.000us 0.00% 0.000us 0.000us 21
3973
+ cudaLaunchKernel 1.22% 87.751us 1.22% 87.751us 7.313us 0.000us 0.00% 0.000us 0.000us 12
3974
+ cudaStreamIsCapturing 0.04% 3.090us 0.04% 3.090us 1.030us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaFuncSetAttribute 0.05% 3.339us 0.05% 3.339us 1.113us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 70.47% 5.066ms 70.47% 5.066ms 5.066ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 7.190ms
3979
+ Self CUDA time total: 5.617ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ torch_mem_eff 3.37% 266.115us 31.17% 2.458ms 2.458ms 0.000us 0.00% 6.082ms 6.082ms 1
3990
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.032ms 100.14% 6.032ms 6.032ms 1
3991
+ aten::scaled_dot_product_attention 0.25% 19.720us 1.92% 151.403us 50.468us 0.000us 0.00% 5.369ms 1.790ms 3
3992
+ aten::_scaled_dot_product_efficient_attention 0.24% 18.800us 1.67% 131.683us 43.894us 0.000us 0.00% 5.369ms 1.790ms 3
3993
+ aten::_efficient_attention_forward 0.36% 28.452us 1.04% 81.963us 27.321us 5.369ms 89.14% 5.369ms 1.790ms 3
3994
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.369ms 89.14% 5.369ms 1.790ms 3
3995
+ aten::contiguous 0.10% 7.851us 25.32% 1.997ms 221.887us 0.000us 0.00% 712.865us 79.207us 9
3996
+ aten::clone 0.51% 40.412us 25.22% 1.989ms 221.015us 0.000us 0.00% 712.865us 79.207us 9
3997
+ aten::copy_ 0.83% 65.138us 24.07% 1.898ms 210.924us 654.369us 10.86% 712.865us 79.207us 9
3998
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.369us 10.86% 654.369us 72.708us 9
3999
+ Activity Buffer Request 22.37% 1.764ms 22.37% 1.764ms 1.764ms 58.496us 0.97% 58.496us 58.496us 1
4000
+ aten::transpose 0.63% 49.872us 0.95% 74.812us 3.117us 0.000us 0.00% 0.000us 0.000us 24
4001
+ aten::as_strided 0.32% 24.940us 0.32% 24.940us 1.039us 0.000us 0.00% 0.000us 0.000us 24
4002
+ aten::empty_like 0.15% 11.509us 0.64% 50.401us 5.600us 0.000us 0.00% 0.000us 0.000us 9
4003
+ aten::empty 0.82% 64.330us 0.82% 64.330us 3.063us 0.000us 0.00% 0.000us 0.000us 21
4004
+ cudaLaunchKernel 1.16% 91.554us 1.16% 91.554us 7.629us 0.000us 0.00% 0.000us 0.000us 12
4005
+ cudaStreamIsCapturing 0.03% 2.671us 0.03% 2.671us 0.890us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaFuncSetAttribute 0.04% 3.101us 0.04% 3.101us 1.034us 0.000us 0.00% 0.000us 0.000us 3
4007
+ cudaDeviceSynchronize 68.83% 5.428ms 68.83% 5.428ms 5.428ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
+ Self CPU time total: 7.886ms
4010
+ Self CUDA time total: 6.024ms
4011
 
4012
 
4013
 
 
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
+ torch_mem_eff 4.19% 329.379us 30.22% 2.377ms 2.377ms 0.000us 0.00% 6.195ms 6.195ms 1
4021
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.146ms 100.15% 6.146ms 6.146ms 1
4022
+ aten::scaled_dot_product_attention 0.26% 20.400us 1.80% 141.523us 47.174us 0.000us 0.00% 5.484ms 1.828ms 3
4023
+ aten::_scaled_dot_product_efficient_attention 0.23% 17.780us 1.54% 121.123us 40.374us 0.000us 0.00% 5.484ms 1.828ms 3
4024
+ aten::_efficient_attention_forward 0.36% 28.239us 1.03% 81.303us 27.101us 5.484ms 89.36% 5.484ms 1.828ms 3
4025
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.484ms 89.36% 5.484ms 1.828ms 3
4026
+ aten::contiguous 0.10% 8.071us 23.69% 1.863ms 207.042us 0.000us 0.00% 711.166us 79.018us 9
4027
+ aten::clone 0.27% 21.510us 23.59% 1.855ms 206.145us 0.000us 0.00% 711.166us 79.018us 9
4028
+ aten::copy_ 0.81% 63.940us 22.65% 1.781ms 197.883us 652.767us 10.64% 711.166us 79.018us 9
4029
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 652.767us 10.64% 652.767us 72.530us 9
4030
+ Activity Buffer Request 18.20% 1.431ms 18.20% 1.431ms 1.431ms 58.399us 0.95% 58.399us 58.399us 1
4031
+ aten::transpose 0.61% 48.309us 0.82% 64.340us 2.681us 0.000us 0.00% 0.000us 0.000us 24
4032
+ aten::as_strided 0.20% 16.031us 0.20% 16.031us 0.668us 0.000us 0.00% 0.000us 0.000us 24
4033
+ aten::empty_like 0.14% 11.029us 0.67% 52.851us 5.872us 0.000us 0.00% 0.000us 0.000us 9
4034
+ aten::empty 0.84% 66.365us 0.84% 66.365us 3.160us 0.000us 0.00% 0.000us 0.000us 21
4035
+ cudaLaunchKernel 3.91% 307.476us 3.91% 307.476us 25.623us 0.000us 0.00% 0.000us 0.000us 12
4036
+ cudaStreamIsCapturing 0.03% 2.550us 0.03% 2.550us 0.850us 0.000us 0.00% 0.000us 0.000us 3
4037
+ cudaFuncSetAttribute 0.05% 4.011us 0.05% 4.011us 1.337us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaDeviceSynchronize 69.78% 5.488ms 69.78% 5.488ms 5.488ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
+ Self CPU time total: 7.864ms
4041
+ Self CUDA time total: 6.137ms
4042
 
4043
 
4044
 
 
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
+ torch_mem_eff 3.07% 246.275us 28.09% 2.251ms 2.251ms 0.000us 0.00% 6.379ms 6.379ms 1
4052
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.328ms 100.14% 6.328ms 6.328ms 1
4053
+ aten::scaled_dot_product_attention 0.24% 19.011us 1.78% 142.253us 47.418us 0.000us 0.00% 5.653ms 1.884ms 3
4054
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.261us 1.54% 123.242us 41.081us 0.000us 0.00% 5.653ms 1.884ms 3
4055
+ aten::_efficient_attention_forward 0.35% 28.069us 1.02% 81.511us 27.170us 5.653ms 89.46% 5.653ms 1.884ms 3
4056
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.653ms 89.46% 5.653ms 1.884ms 3
4057
+ aten::contiguous 0.10% 7.649us 22.70% 1.819ms 202.115us 0.000us 0.00% 725.600us 80.622us 9
4058
+ aten::clone 0.27% 22.011us 22.61% 1.811ms 201.265us 0.000us 0.00% 725.600us 80.622us 9
4059
+ aten::copy_ 0.79% 63.041us 21.68% 1.737ms 193.055us 666.112us 10.54% 725.600us 80.622us 9
4060
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 666.112us 10.54% 666.112us 74.012us 9
4061
+ Activity Buffer Request 18.14% 1.453ms 18.14% 1.453ms 1.453ms 59.488us 0.94% 59.488us 59.488us 1
4062
+ aten::transpose 0.62% 49.849us 0.82% 66.103us 2.754us 0.000us 0.00% 0.000us 0.000us 24
4063
+ aten::as_strided 0.20% 16.254us 0.20% 16.254us 0.677us 0.000us 0.00% 0.000us 0.000us 24
4064
+ aten::empty_like 0.15% 11.889us 0.65% 51.880us 5.764us 0.000us 0.00% 0.000us 0.000us 9
4065
+ aten::empty 0.80% 64.291us 0.80% 64.291us 3.061us 0.000us 0.00% 0.000us 0.000us 21
4066
+ cudaLaunchKernel 3.04% 243.917us 3.04% 243.917us 20.326us 0.000us 0.00% 0.000us 0.000us 12
4067
+ cudaStreamIsCapturing 0.04% 3.200us 0.04% 3.200us 1.067us 0.000us 0.00% 0.000us 0.000us 3
4068
+ cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
4069
+ cudaDeviceSynchronize 71.91% 5.762ms 71.91% 5.762ms 5.762ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ Self CPU time total: 8.013ms
4072
+ Self CUDA time total: 6.319ms
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_mem_eff 2.99% 249.826us 26.96% 2.254ms 2.254ms 0.000us 0.00% 6.738ms 6.738ms 1
4083
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.686ms 100.13% 6.686ms 6.686ms 1
4084
+ aten::scaled_dot_product_attention 0.22% 18.532us 1.72% 143.464us 47.821us 0.000us 0.00% 6.005ms 2.002ms 3
4085
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.750us 1.49% 124.932us 41.644us 0.000us 0.00% 6.005ms 2.002ms 3
4086
+ aten::_efficient_attention_forward 0.34% 28.159us 0.97% 81.312us 27.104us 6.005ms 89.92% 6.005ms 2.002ms 3
4087
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.005ms 89.92% 6.005ms 2.002ms 3
4088
+ aten::contiguous 0.11% 8.892us 21.70% 1.814ms 201.591us 0.000us 0.00% 733.564us 81.507us 9
4089
+ aten::clone 0.28% 23.489us 21.59% 1.805ms 200.603us 0.000us 0.00% 733.564us 81.507us 9
4090
+ aten::copy_ 0.78% 65.381us 20.67% 1.729ms 192.090us 672.957us 10.08% 733.564us 81.507us 9
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 672.957us 10.08% 672.957us 74.773us 9
4092
+ Activity Buffer Request 17.24% 1.442ms 17.24% 1.442ms 1.442ms 60.607us 0.91% 60.607us 60.607us 1
4093
+ aten::transpose 0.64% 53.558us 0.84% 70.590us 2.941us 0.000us 0.00% 0.000us 0.000us 24
4094
+ aten::as_strided 0.20% 17.032us 0.20% 17.032us 0.710us 0.000us 0.00% 0.000us 0.000us 24
4095
+ aten::empty_like 0.15% 12.490us 0.64% 53.131us 5.903us 0.000us 0.00% 0.000us 0.000us 9
4096
+ aten::empty 0.79% 65.813us 0.79% 65.813us 3.134us 0.000us 0.00% 0.000us 0.000us 21
4097
+ cudaLaunchKernel 2.91% 243.356us 2.91% 243.356us 20.280us 0.000us 0.00% 0.000us 0.000us 12
4098
+ cudaStreamIsCapturing 0.04% 3.000us 0.04% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
4099
+ cudaFuncSetAttribute 0.04% 3.289us 0.04% 3.289us 1.096us 0.000us 0.00% 0.000us 0.000us 3
4100
+ cudaDeviceSynchronize 73.04% 6.108ms 73.04% 6.108ms 6.108ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
+ Self CPU time total: 8.362ms
4103
+ Self CUDA time total: 6.678ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
+ torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
4108
+ torch_mem_eff cuda_attn_L256_bfloat16 1.91 True
4109
+ torch_mem_eff cuda_attn_L320_bfloat16 1.96 True
4110
  torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4111
+ torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
4112
+ torch_mem_eff cuda_attn_L512_bfloat16 2.18 True
4113
  </pre></div>
4114
  <div class="cell-artifacts">
4115
  <h4>Artifacts:</h4>
4116
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
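
Note: every torch_mem_eff trace above routes through aten::scaled_dot_product_attention into the fmha_cutlassF_bf16 kernel, with an aten::contiguous → aten::clone → aten::copy_ chain ahead of it (roughly 10% of CUDA time) from layout conversion. A minimal sketch of pinning SDPA to that backend — shapes, sizes, and dtype are assumptions, and this is not the benchmark's actual cell:

    import torch
    from torch.nn.attention import SDPBackend, sdpa_kernel

    # (batch, heads, seq_len, head_dim) in bfloat16; concrete sizes are assumptions.
    q = torch.randn(1, 8, 512, 64, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)

    # Restrict dispatch so the profiler records
    # aten::_scaled_dot_product_efficient_attention, as in the traces above.
    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
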
flash_attn/impls/sage_attention.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 4.32s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3920,23 +3920,28 @@ Cell: benchmark | 4.32s
3920
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3921
  impl wl p50(ms) ok
3922
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3923
- Error: module &#x27;sage_attention_5f1806f1e6d9e7bd&#x27; has no attribute &#x27;fwd&#x27;
3924
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3925
- Error: module &#x27;sage_attention_5f1806f1e6d9e7bd&#x27; has no attribute &#x27;fwd&#x27;
3926
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3927
- Error: module &#x27;sage_attention_5f1806f1e6d9e7bd&#x27; has no attribute &#x27;fwd&#x27;
3928
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3929
- Error: module &#x27;sage_attention_5f1806f1e6d9e7bd&#x27; has no attribute &#x27;fwd&#x27;
3930
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3931
- Error: module &#x27;sage_attention_5f1806f1e6d9e7bd&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3933
- Error: module &#x27;sage_attention_5f1806f1e6d9e7bd&#x27; has no attribute &#x27;fwd&#x27;
3934
  </pre></div>
3935
- <div class="cell-stderr">
3936
- Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3937
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.32it/s]
3938
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.93it/s]
3939
  </div>
 
3940
  <div class="cell-artifacts">
3941
  <h4>Artifacts:</h4>
3942
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 4.85s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3920
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3921
  impl wl p50(ms) ok
3922
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3923
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
3924
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3925
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
3926
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3927
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
3928
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3929
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
3930
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3931
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3933
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
3934
  </pre></div>
3935
+ <div class="uv-install-logs" id="uv-logs-benchmark">
3936
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3937
+ <div class="uv-logs-content" style="display: none;">
3938
+ Installed 15 packages in 14ms
3939
  </div>
3940
+ </div>
3941
+ <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3942
+ Fetching 11 files: 27%|██▋ | 3/11 [00:00&lt;00:00, 25.80it/s]
3943
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.20it/s]
3944
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 17.81it/s]</div>
3945
  <div class="cell-artifacts">
3946
  <h4>Artifacts:</h4>
3947
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
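
Note: both runs of this cell fail identically — the 11 kernel files fetch fine, but the loaded module exposes no `fwd`; only the build hash in the module name differs between runs. A hypothetical guard that reports what the module actually exports; the repo id and the `get_kernel` loader usage are assumptions, not the benchmark's actual code:

    from kernels import get_kernel

    mod = get_kernel("kernels-community/sage_attention")  # repo id is an assumption
    if not hasattr(mod, "fwd"):
        exported = [n for n in dir(mod) if not n.startswith("_")]
        raise RuntimeError(f"sage_attention exports no 'fwd'; found: {exported}")
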
flash_attn/impls/xformers.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.56s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
- xformers_meff 10.99% 493.828us 51.93% 2.334ms 2.334ms 0.000us 0.00% 3.600ms 3.600ms 1
3927
- xformers_flash3::flash_fwd 4.32% 194.118us 40.08% 1.801ms 600.437us 0.000us 0.00% 3.600ms 1.200ms 3
3928
- flash_attn_3::fwd 1.81% 81.292us 35.76% 1.607ms 535.731us 2.714ms 100.00% 3.600ms 1.200ms 3
3929
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.716ms 100.05% 2.716ms 2.716ms 1
3930
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.714ms 100.00% 2.714ms 904.730us 3
3931
- Activity Buffer Request 31.96% 1.436ms 31.96% 1.436ms 1.436ms 885.349us 32.62% 885.349us 885.349us 1
3932
- aten::empty 0.86% 38.850us 0.86% 38.850us 6.475us 0.000us 0.00% 0.000us 0.000us 6
3933
- cudaFuncSetAttribute 0.25% 11.022us 0.25% 11.022us 3.674us 0.000us 0.00% 0.000us 0.000us 3
3934
- cudaLaunchKernel 0.88% 39.751us 0.88% 39.751us 13.250us 0.000us 0.00% 0.000us 0.000us 3
3935
- aten::reshape 0.26% 11.630us 0.87% 38.970us 6.495us 0.000us 0.00% 0.000us 0.000us 6
3936
- aten::view 0.61% 27.340us 0.61% 27.340us 4.557us 0.000us 0.00% 0.000us 0.000us 6
3937
- cudaDeviceSynchronize 48.07% 2.160ms 48.07% 2.160ms 2.160ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.494ms
3940
- Self CUDA time total: 2.714ms
3941
 
3942
 
3943
 
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- xformers_meff 7.45% 327.551us 47.96% 2.108ms 2.108ms 0.000us 0.00% 3.684ms 3.684ms 1
3951
- xformers_flash3::flash_fwd 3.56% 156.647us 39.91% 1.754ms 584.750us 0.000us 0.00% 3.684ms 1.228ms 3
3952
- flash_attn_3::fwd 1.31% 57.602us 36.35% 1.598ms 532.534us 2.754ms 100.00% 3.684ms 1.228ms 3
3953
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.06% 2.755ms 2.755ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.00% 2.754ms 917.895us 3
3955
- Activity Buffer Request 33.31% 1.464ms 33.31% 1.464ms 1.464ms 930.812us 33.80% 930.812us 930.812us 1
3956
- aten::empty 0.76% 33.251us 0.76% 33.251us 5.542us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.14% 6.040us 0.14% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.83% 36.590us 0.83% 36.590us 12.197us 0.000us 0.00% 0.000us 0.000us 3
3959
- aten::reshape 0.23% 10.130us 0.60% 26.441us 4.407us 0.000us 0.00% 0.000us 0.000us 6
3960
- aten::view 0.37% 16.311us 0.37% 16.311us 2.719us 0.000us 0.00% 0.000us 0.000us 6
3961
- cudaDeviceSynchronize 52.04% 2.287ms 52.04% 2.287ms 2.287ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
- Self CPU time total: 4.395ms
3964
- Self CUDA time total: 2.754ms
3965
 
3966
 
3967
 
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.93% 309.631us 45.92% 2.051ms 2.051ms 0.000us 0.00% 3.806ms 3.806ms 1
- xformers_flash3::flash_fwd 3.88% 173.206us 38.45% 1.717ms 572.356us 0.000us 0.00% 3.806ms 1.269ms 3
- flash_attn_3::fwd 1.30% 58.031us 34.57% 1.544ms 514.621us 2.838ms 100.00% 3.806ms 1.269ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.840ms 100.06% 2.840ms 2.840ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.00% 2.838ms 945.948us 3
- Activity Buffer Request 31.70% 1.416ms 31.70% 1.416ms 1.416ms 968.572us 34.13% 968.572us 968.572us 1
- aten::empty 0.70% 31.373us 0.70% 31.373us 5.229us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.510us 0.12% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 0.74% 33.081us 0.74% 33.081us 11.027us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.19% 8.679us 0.54% 24.060us 4.010us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.34% 15.381us 0.34% 15.381us 2.564us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 54.08% 2.416ms 54.08% 2.416ms 2.416ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.466ms
- Self CUDA time total: 2.838ms



@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.70% 313.562us 47.60% 2.227ms 2.227ms 0.000us 0.00% 3.863ms 3.863ms 1
- xformers_flash3::flash_fwd 3.24% 151.796us 40.34% 1.888ms 629.212us 0.000us 0.00% 3.863ms 1.288ms 3
- flash_attn_3::fwd 1.25% 58.574us 37.10% 1.736ms 578.613us 2.888ms 100.00% 3.863ms 1.288ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.743us 3
- Activity Buffer Request 30.65% 1.434ms 30.65% 1.434ms 1.434ms 974.434us 33.74% 974.434us 974.434us 1
- aten::empty 0.64% 30.051us 0.64% 30.051us 5.008us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 5.730us 0.12% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 4.43% 207.206us 4.43% 207.206us 69.069us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.22% 10.139us 0.56% 26.119us 4.353us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.34% 15.980us 0.34% 15.980us 2.663us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 52.40% 2.452ms 52.40% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 4.679ms
- Self CUDA time total: 2.888ms



@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.05% 310.689us 42.88% 2.201ms 2.201ms 0.000us 0.00% 4.489ms 4.489ms 1
- xformers_flash3::flash_fwd 2.93% 150.475us 36.35% 1.866ms 622.001us 0.000us 0.00% 4.489ms 1.496ms 3
- flash_attn_3::fwd 1.04% 53.593us 33.42% 1.716ms 571.843us 3.365ms 100.00% 4.489ms 1.496ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.367ms 100.05% 3.367ms 3.367ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.365ms 100.00% 3.365ms 1.122ms 3
- Activity Buffer Request 28.02% 1.439ms 28.02% 1.439ms 1.439ms 1.123ms 33.38% 1.123ms 1.123ms 1
- aten::empty 0.59% 30.191us 0.59% 30.191us 5.032us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.12% 6.030us 0.12% 6.030us 2.010us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.65% 187.166us 3.65% 187.166us 62.389us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.18% 9.272us 0.47% 24.322us 4.054us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.29% 15.050us 0.29% 15.050us 2.508us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 57.12% 2.932ms 57.12% 2.932ms 2.932ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.133ms
- Self CUDA time total: 3.365ms



@@ -4043,37 +4043,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- xformers_meff 6.40% 331.462us 43.16% 2.236ms 2.236ms 0.000us 0.00% 4.557ms 4.557ms 1
- xformers_flash3::flash_fwd 2.99% 154.686us 36.26% 1.879ms 626.255us 0.000us 0.00% 4.557ms 1.519ms 3
- flash_attn_3::fwd 1.13% 58.511us 33.27% 1.724ms 574.693us 3.413ms 100.00% 4.557ms 1.519ms 3
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.05% 3.415ms 3.415ms 1
- void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3
- Activity Buffer Request 27.70% 1.435ms 27.70% 1.435ms 1.435ms 1.144ms 33.52% 1.144ms 1.144ms 1
- aten::empty 0.61% 31.572us 0.61% 31.572us 5.262us 0.000us 0.00% 0.000us 0.000us 6
- cudaFuncSetAttribute 0.11% 5.890us 0.11% 5.890us 1.963us 0.000us 0.00% 0.000us 0.000us 3
- cudaLaunchKernel 3.72% 192.906us 3.72% 192.906us 64.302us 0.000us 0.00% 0.000us 0.000us 3
- aten::reshape 0.18% 9.270us 0.50% 26.000us 4.333us 0.000us 0.00% 0.000us 0.000us 6
- aten::view 0.32% 16.730us 0.32% 16.730us 2.788us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 56.84% 2.946ms 56.84% 2.946ms 2.946ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 5.182ms
- Self CUDA time total: 3.413ms


  impl wl p50(ms) ok
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
- xformers_meff cuda_attn_L256_bfloat16 1.02 True
- xformers_meff cuda_attn_L320_bfloat16 1.07 True
- xformers_meff cuda_attn_L384_bfloat16 1.08 True
  xformers_meff cuda_attn_L448_bfloat16 1.24 True
- xformers_meff cuda_attn_L512_bfloat16 1.23 True
  </pre></div>
  <div class="uv-install-logs" id="uv-logs-benchmark">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
  Downloading xformers (111.8MiB)
  Downloading xformers
- Installed 1 package in 13ms
  </div>
  </div>
  <div class="cell-artifacts">
 
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 8.72s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 10.73% 480.812us 51.38% 2.302ms 2.302ms 0.000us 0.00% 3.631ms 3.631ms 1
+ xformers_flash3::flash_fwd 4.61% 206.363us 39.81% 1.783ms 594.453us 0.000us 0.00% 3.631ms 1.210ms 3
+ flash_attn_3::fwd 1.72% 77.043us 35.21% 1.577ms 525.665us 2.730ms 100.00% 3.631ms 1.210ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.731ms 100.06% 2.731ms 2.731ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.00% 2.730ms 909.864us 3
+ Activity Buffer Request 31.52% 1.412ms 31.52% 1.412ms 1.412ms 901.213us 33.02% 901.213us 901.213us 1
+ aten::empty 0.77% 34.510us 0.77% 34.510us 5.752us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.24% 10.880us 0.24% 10.880us 3.627us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.96% 42.842us 0.96% 42.842us 14.281us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.26% 11.610us 0.84% 37.430us 6.238us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.58% 25.820us 0.58% 25.820us 4.303us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 48.62% 2.178ms 48.62% 2.178ms 2.178ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.479ms
+ Self CUDA time total: 2.730ms



  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 7.14% 318.116us 45.64% 2.033ms 2.033ms 0.000us 0.00% 3.819ms 3.819ms 1
+ xformers_flash3::flash_fwd 3.43% 153.034us 38.00% 1.693ms 564.339us 0.000us 0.00% 3.819ms 1.273ms 3
+ flash_attn_3::fwd 1.25% 55.902us 34.56% 1.540ms 513.328us 2.852ms 100.00% 3.819ms 1.273ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.853ms 100.05% 2.853ms 2.853ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.00% 2.852ms 950.587us 3
+ Activity Buffer Request 31.72% 1.413ms 31.72% 1.413ms 1.413ms 967.259us 33.92% 967.259us 967.259us 1
+ aten::empty 0.68% 30.270us 0.68% 30.270us 5.045us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.13% 5.700us 0.13% 5.700us 1.900us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.78% 34.811us 0.78% 34.811us 11.604us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.19% 8.522us 0.50% 22.121us 3.687us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.31% 13.599us 0.31% 13.599us 2.266us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.36% 2.422ms 54.36% 2.422ms 2.422ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.455ms
+ Self CUDA time total: 2.852ms



  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.88% 312.747us 44.90% 2.040ms 2.040ms 0.000us 0.00% 3.937ms 3.937ms 1
+ xformers_flash3::flash_fwd 3.35% 152.284us 37.52% 1.705ms 568.205us 0.000us 0.00% 3.937ms 1.312ms 3
+ flash_attn_3::fwd 1.19% 54.281us 34.17% 1.552ms 517.444us 2.934ms 100.00% 3.937ms 1.312ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.936ms 100.05% 2.936ms 2.936ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.00% 2.934ms 977.979us 3
+ Activity Buffer Request 31.39% 1.426ms 31.39% 1.426ms 1.426ms 1.003ms 34.19% 1.003ms 1.003ms 1
+ aten::empty 0.67% 30.639us 0.67% 30.639us 5.106us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.14% 6.530us 0.14% 6.530us 2.177us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 0.77% 34.781us 0.77% 34.781us 11.594us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.19% 8.650us 0.49% 22.320us 3.720us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.30% 13.670us 0.30% 13.670us 2.278us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 55.10% 2.503ms 55.10% 2.503ms 2.503ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.543ms
+ Self CUDA time total: 2.934ms



  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.56% 308.746us 47.29% 2.227ms 2.227ms 0.000us 0.00% 3.897ms 3.897ms 1
+ xformers_flash3::flash_fwd 3.22% 151.743us 40.27% 1.897ms 632.183us 0.000us 0.00% 3.897ms 1.299ms 3
+ flash_attn_3::fwd 1.19% 56.081us 37.05% 1.745ms 581.602us 2.911ms 100.00% 3.897ms 1.299ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.05% 2.913ms 2.913ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.911ms 100.00% 2.911ms 970.491us 3
+ Activity Buffer Request 30.05% 1.415ms 30.05% 1.415ms 1.415ms 985.179us 33.84% 985.179us 985.179us 1
+ aten::empty 0.65% 30.820us 0.65% 30.820us 5.137us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.13% 6.030us 0.13% 6.030us 2.010us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 5.02% 236.645us 5.02% 236.645us 78.882us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.18% 8.502us 0.47% 22.111us 3.685us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.29% 13.609us 0.29% 13.609us 2.268us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 52.71% 2.482ms 52.71% 2.482ms 2.482ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 4.710ms
+ Self CUDA time total: 2.911ms



  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.33% 326.758us 43.32% 2.236ms 2.236ms 0.000us 0.00% 4.559ms 4.559ms 1
+ xformers_flash3::flash_fwd 3.59% 185.275us 36.53% 1.885ms 628.414us 0.000us 0.00% 4.559ms 1.520ms 3
+ flash_attn_3::fwd 1.12% 57.990us 32.94% 1.700ms 566.655us 3.412ms 100.00% 4.559ms 1.520ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.05% 3.413ms 3.413ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.412ms 100.00% 3.412ms 1.137ms 3
+ Activity Buffer Request 27.43% 1.416ms 27.43% 1.416ms 1.416ms 1.147ms 33.63% 1.147ms 1.147ms 1
+ aten::empty 0.66% 34.131us 0.66% 34.131us 5.688us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.12% 6.360us 0.12% 6.360us 2.120us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.60% 185.845us 3.60% 185.845us 61.948us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.17% 8.790us 0.46% 23.539us 3.923us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.29% 14.749us 0.29% 14.749us 2.458us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 56.68% 2.925ms 56.68% 2.925ms 2.925ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.161ms
+ Self CUDA time total: 3.412ms



  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ xformers_meff 6.07% 310.905us 43.25% 2.215ms 2.215ms 0.000us 0.00% 4.499ms 4.499ms 1
+ xformers_flash3::flash_fwd 3.55% 181.844us 36.73% 1.881ms 626.964us 0.000us 0.00% 4.499ms 1.500ms 3
+ flash_attn_3::fwd 1.14% 58.453us 33.18% 1.699ms 566.349us 3.369ms 100.00% 4.499ms 1.500ms 3
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.371ms 100.06% 3.371ms 3.371ms 1
+ void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.369ms 100.00% 3.369ms 1.123ms 3
+ Activity Buffer Request 27.78% 1.423ms 27.78% 1.423ms 1.423ms 1.130ms 33.54% 1.130ms 1.130ms 1
+ aten::empty 0.65% 33.340us 0.65% 33.340us 5.557us 0.000us 0.00% 0.000us 0.000us 6
+ cudaFuncSetAttribute 0.11% 5.670us 0.11% 5.670us 1.890us 0.000us 0.00% 0.000us 0.000us 3
+ cudaLaunchKernel 3.50% 178.983us 3.50% 178.983us 59.661us 0.000us 0.00% 0.000us 0.000us 3
+ aten::reshape 0.17% 8.671us 0.45% 22.942us 3.824us 0.000us 0.00% 0.000us 0.000us 6
+ aten::view 0.28% 14.271us 0.28% 14.271us 2.378us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 56.75% 2.906ms 56.75% 2.906ms 2.906ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 5.120ms
+ Self CUDA time total: 3.369ms


  impl wl p50(ms) ok
+ xformers_meff cuda_attn_L128_bfloat16 1.01 True
+ xformers_meff cuda_attn_L256_bfloat16 1.04 True
+ xformers_meff cuda_attn_L320_bfloat16 1.10 True
+ xformers_meff cuda_attn_L384_bfloat16 1.10 True
  xformers_meff cuda_attn_L448_bfloat16 1.24 True
+ xformers_meff cuda_attn_L512_bfloat16 1.24 True
  </pre></div>
  <div class="uv-install-logs" id="uv-logs-benchmark">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
  Downloading xformers (111.8MiB)
  Downloading xformers
+ Installed 38 packages in 194ms
  </div>
  </div>
  <div class="cell-artifacts">
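The trace rows above (the xformers_meff marker, the three flash_attn_3::fwd launches, and the trailing cudaDeviceSynchronize) are the shape of output torch.profiler produces for a wrapped attention call. A minimal sketch of such a harness follows; shapes, dtype, and rep count are illustrative assumptions, not taken from cells/benchmark.py:

    # Sketch of a torch.profiler harness for the xformers path (assumed setup).
    import torch
    from torch.profiler import profile, record_function, ProfilerActivity
    import xformers.ops as xops

    def xformers_meff(q, k, v):
        # xformers expects (batch, seq_len, num_heads, head_dim)
        return xops.memory_efficient_attention(q, k, v)

    B, L, H, D = 1, 512, 16, 64  # illustrative workload
    q, k, v = (torch.randn(B, L, H, D, device="cuda", dtype=torch.bfloat16)
               for _ in range(3))

    for _ in range(3):  # warmup so lazy initialization does not pollute the trace
        xformers_meff(q, k, v)
    torch.cuda.synchronize()

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        with record_function("xformers_meff"):  # the labelled row in the tables
            for _ in range(3):  # matches the "# of Calls = 3" kernel rows above
                xformers_meff(q, k, v)
        torch.cuda.synchronize()  # shows up as the cudaDeviceSynchronize row

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=12))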
flash_attn/results/artifacts/combine/latency.svg CHANGED
flash_attn/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
- <dc:date>2025-10-29T00:37:33.622731</dc:date>
  [previous matplotlib SVG plot markup elided: y-axis grid lines, tick marks, and tick labels 1.0–2.2, plus data-point paths for the torch_flash_ma, torch_mem_eff, xformers_meff, hf_kernels_flash_attn, and hf_kernels_flash_attn3 series]
@@ -4230,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: combine | 4.30s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4338,47 +4338,47 @@ COMBINED BENCHMARK SUMMARY

  impl wl p50(ms) ok
  hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
  hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
- Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
- Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
- Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
- Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
- Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
- Error: module 'sage_attention_5f1806f1e6d9e7bd' has no attribute 'fwd'
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
- torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
  torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
  torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
- torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
- torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
- torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
- torch_mem_eff cuda_attn_L256_bfloat16 1.99 True
- torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
  torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
- torch_mem_eff cuda_attn_L448_bfloat16 2.06 True
- torch_mem_eff cuda_attn_L512_bfloat16 2.22 True
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
- xformers_meff cuda_attn_L256_bfloat16 1.02 True
- xformers_meff cuda_attn_L320_bfloat16 1.07 True
- xformers_meff cuda_attn_L384_bfloat16 1.08 True
  xformers_meff cuda_attn_L448_bfloat16 1.24 True
- xformers_meff cuda_attn_L512_bfloat16 1.23 True

  GENERATING COMBINED VISUALIZATION

@@ -4402,7 +4402,7 @@ Implementations included:
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 221ms
  </div>
  </div>
  <div class="cell-artifacts">
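The combined table above is essentially a fold over the per-implementation JSONL artifacts, whose record format is visible at the top of this commit (impl, wl.name, lat_ms.p50, ok). A rough sketch of that step, with the glob pattern assumed rather than taken from cells/combine.py:

    # Sketch of the combine step: per-impl JSONL records -> one p50 summary table.
    import json
    from pathlib import Path

    rows = []
    for path in sorted(Path(".").glob("*/impls/artifacts/benchmark/*.jsonl")):  # assumed layout
        for line in path.read_text().splitlines():
            rec = json.loads(line)
            ok = bool(rec.get("ok"))
            p50 = rec["lat_ms"]["p50"] if ok else None  # failed runs have no latency
            rows.append((rec["impl"], rec["wl"]["name"], p50, ok))

    print(f"{'impl':<26} {'wl':<26} {'p50(ms)':>8} {'ok':>6}")
    for impl, wl, p50, ok in sorted(rows):
        cell = f"{p50:.2f}" if p50 is not None else "FAIL"
        print(f"{impl:<26} {wl:<26} {cell:>8} {str(ok):>6}")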
@@ -4415,7 +4415,7 @@ Installed 37 packages in 221ms
- <dc:date>2025-10-29T00:37:33.622731</dc:date>
  [previous matplotlib SVG plot markup elided: y-axis grid lines, tick marks, and tick labels 1.0–2.2, plus data-point paths for the torch_flash_ma, torch_mem_eff, xformers_meff, hf_kernels_flash_attn, and hf_kernels_flash_attn3 series]
 
+ <dc:date>2025-10-29T04:14:54.057236</dc:date>
  [regenerated matplotlib SVG plot markup elided: the same y-axis grid/tick elements relabelled 1.0–2.2 at shifted positions, and updated data-point paths for the torch_flash_ma, torch_mem_eff, and xformers_meff series reflecting the re-run latencies; the diff is truncated here]
4121
  </g>
4122
  </g>
4123
  <g id="series--hf-kernels-flash-attn" class="series">
4124
+ <path d="M 83.607806 420.124435 L 226.799032 406.579091 L 369.990258 388.286263 L 513.181484 382.130816 L 656.37271 337.645415 L 799.563935 338.021157 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4125
  <defs>
4126
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4127
  </defs>
4128
  <g clip-path="url(#p09feef2583)">
4129
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="420.124435" style="fill: #d62728; stroke: #d62728" />
4130
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="406.579091" style="fill: #d62728; stroke: #d62728" />
4131
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="388.286263" style="fill: #d62728; stroke: #d62728" />
4132
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="382.130816" style="fill: #d62728; stroke: #d62728" />
4133
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="337.645415" style="fill: #d62728; stroke: #d62728" />
4134
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="338.021157" style="fill: #d62728; stroke: #d62728" />
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-flash-attn3" class="series">
4138
+ <path d="M 83.607806 428.387702 L 226.799032 415.233374 L 369.990258 396.467356 L 513.181484 398.1139 L 656.37271 345.939841 L 799.563935 352.101398 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4141
  </defs>
4142
  <g clip-path="url(#p09feef2583)">
4143
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4144
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.233374" style="fill: #9467bd; stroke: #9467bd" />
4145
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="396.467356" style="fill: #9467bd; stroke: #9467bd" />
4146
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="398.1139" style="fill: #9467bd; stroke: #9467bd" />
4147
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="345.939841" style="fill: #9467bd; stroke: #9467bd" />
4148
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="352.101398" style="fill: #9467bd; stroke: #9467bd" />
4149
  </g>
4150
  </g>
4151
  <g id="patch_3">
 
4230
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4231
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4232
  </span> |
4233
+ Cell: combine | 4.23s
4234
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4235
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4236
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4338
 
4339
  impl wl p50(ms) ok
4340
  hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4341
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4342
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4343
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
4344
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4345
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
4346
  hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
4347
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.97 True
4348
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4349
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4350
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.20 True
4351
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4352
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4353
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
4354
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4355
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
4356
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4357
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
4358
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4359
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
4360
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4361
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4363
+ Error: module &#x27;sage_attention_717bd9367b3cdd60&#x27; has no attribute &#x27;fwd&#x27;
4364
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4365
+ torch_flash_ma cuda_attn_L256_bfloat16 1.26 True
4366
  torch_flash_ma cuda_attn_L320_bfloat16 1.29 True
4367
  torch_flash_ma cuda_attn_L384_bfloat16 1.32 True
4368
+ torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4369
+ torch_flash_ma cuda_attn_L512_bfloat16 1.51 True
4370
+ torch_mem_eff cuda_attn_L128_bfloat16 1.84 True
4371
+ torch_mem_eff cuda_attn_L256_bfloat16 1.91 True
4372
+ torch_mem_eff cuda_attn_L320_bfloat16 1.96 True
4373
  torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4374
+ torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
4375
+ torch_mem_eff cuda_attn_L512_bfloat16 2.18 True
4376
+ xformers_meff cuda_attn_L128_bfloat16 1.01 True
4377
+ xformers_meff cuda_attn_L256_bfloat16 1.04 True
4378
+ xformers_meff cuda_attn_L320_bfloat16 1.10 True
4379
+ xformers_meff cuda_attn_L384_bfloat16 1.10 True
4380
  xformers_meff cuda_attn_L448_bfloat16 1.24 True
4381
+ xformers_meff cuda_attn_L512_bfloat16 1.24 True
4382
 
4383
  GENERATING COMBINED VISUALIZATION
4384
 
 
4402
  <div class="uv-install-logs" id="uv-logs-combine">
4403
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4404
  <div class="uv-logs-content" style="display: none;">
4405
+ Installed 37 packages in 199ms
4406
  </div>
4407
  </div>
4408
  <div class="cell-artifacts">
 
+ <dc:date>2025-10-29T04:14:54.057236</dc:date>
[duplicate SVG plot markup elided: the same combined latency figure as above, embedded in combined_results.html with refreshed metadata — y-axis ticks 1.0 through 2.2 (ms); series torch_flash_ma, torch_mem_eff, xformers_meff, hf_kernels_flash_attn, hf_kernels_flash_attn3.]
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8265980000032869, "p50": 0.8294890000115629, "p90": 0.8318879999933415, "mean": 0.8305783999958294, "iqr": 0.0024899999857552757, "raw_times": [0.8318879999933415, 0.8294890000115629, 0.8293980000075862, 0.8355189999633694, 0.8265980000032869], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8372490000283506, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6484859999650325, "p50": 1.6553460000068299, "p90": 1.6562569999791776, "mean": 1.654196599986335, "iqr": 0.004349999983332964, "raw_times": [1.6589869999847906, 1.6484859999650325, 1.6553460000068299, 1.6519069999958447, 1.6562569999791776], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6548570000054497, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6374860000496483, "p50": 1.6479959999742277, "p90": 1.650296000036633, "mean": 1.6462442000261035, "iqr": 0.007159000006140559, "raw_times": [1.6479959999742277, 1.6374860000496483, 1.6523060000395162, 1.6431370000304923, 1.650296000036633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.658577000000605, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-10-29T00:37:05Z", "run": "61768a8c1365453ebb762f9da29e2af1", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2406110000238186, "p50": 3.2579909999981282, "p90": 3.259831999969265, "mean": 3.2558895999954984, "iqr": 0.00626999997166422, "raw_times": [3.259831999969265, 3.2579909999981282, 3.2674519999886797, 3.2535619999976007, 3.2406110000238186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2579709999822626, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-10-29T04:14:34Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8066769999572898, "p50": 0.8187079999970592, "p90": 0.8202469999787354, "mean": 0.8162713999922744, "iqr": 0.004819999958272092, "raw_times": [0.8066769999572898, 0.8187079999970592, 0.8202980000078242, 0.8202469999787354, 0.8154270000204633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8278980000113734, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-10-29T04:14:34Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6742259999773523, "p50": 1.6819360000113193, "p90": 1.682725999955892, "mean": 1.6819601999827682, "iqr": 0.0012589999869305757, "raw_times": [1.6814669999689613, 1.6742259999773523, 1.6819360000113193, 1.6894460000003164, 1.682725999955892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6828459999942424, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-10-29T04:14:35Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6012950000003912, "p50": 1.6123539999739478, "p90": 1.612534000003052, "mean": 1.6096803999971598, "iqr": 0.005670000007285125, "raw_times": [1.6068639999957668, 1.612534000003052, 1.6123539999739478, 1.6012950000003912, 1.6153550000126415], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6159039999479319, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-10-29T04:14:35Z", "run": "fe58e781071b44039fe2ff8652618fab", "impl": "torch_layer_norm", "tags": {"family": "torch", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.330611000023964, "p50": 3.334851999966304, "p90": 3.3351920000086466, "mean": 3.3337277999976322, "iqr": 0.003470000024208275, "raw_times": [3.3351920000086466, 3.330611000023964, 3.3317219999844383, 3.336262000004808, 3.334851999966304], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.3335720000309266, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/cells/benchmark.py CHANGED
@@ -3,7 +3,6 @@
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
6
- # "kernels",
7
  # "kernels-benchmark-tools",
8
  # ]
9
  #
@@ -13,37 +12,15 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- from kernels import get_kernel
17
 
18
- # Load the layer norm kernel
19
- layer_norm_kernel = get_kernel("kernels-community/layer-norm")
20
 
21
-
22
- def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
23
- B, S, D = x.shape
24
- # The kernel expects [N, D] input; support beta (bias) if provided.
25
- out = layer_norm_kernel.dropout_add_ln_fwd(
26
- input=x.view(-1, D),
27
- gamma=weight,
28
- beta=bias,
29
- rowscale=None,
30
- colscale=None,
31
- x0_subset=None,
32
- z_subset=None,
33
- dropout_p=0.0,
34
- epsilon=eps,
35
- rowscale_const=1.0,
36
- z_numrows=S,
37
- gen=None,
38
- residual_in_fp32=False,
39
- is_rms_norm=False,
40
- )[0].view(B, S, D)
41
- return out
42
 
43
 
44
  run_benchmark(
45
  kernel_type=KernelTypeEnum.LAYER_NORM,
46
- impl_name="hf_kernels_layer_norm",
47
- impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
48
- impl_func=hf_kernels_layer_norm,
49
  )
 
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
 
6
  # "kernels-benchmark-tools",
7
  # ]
8
  #
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
+ def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
18
+ return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
  run_benchmark(
22
  kernel_type=KernelTypeEnum.LAYER_NORM,
23
+ impl_name="torch_layer_norm",
24
+ impl_tags={"family": "torch", "op": "layer_norm"},
25
+ impl_func=torch_layer_norm,
26
  )
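The corr blocks in the JSONL above record rtol=0.001 and atol=0.03125 against a "layer_norm_ref" reference. A hedged sketch of such a check for the torch_layer_norm wrapper, assuming (as is common) that the reference is computed in fp32 and cast back — the actual check lives in kernels-benchmark-tools, not here:

import torch
import torch.nn.functional as F

def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
    return F.layer_norm(x, (x.shape[-1],), weight, bias, eps)

B, S, D = 16, 2048, 4096  # matches workload LN_B16_S2048_D4096
x = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
w = torch.randn(D, device="cuda", dtype=torch.bfloat16)
b = torch.randn(D, device="cuda", dtype=torch.bfloat16)

out = torch_layer_norm(x, w, b)
# fp32 reference, cast back to bfloat16 for comparison (assumed scheme).
ref = F.layer_norm(x.float(), (D,), w.float(), b.float()).to(torch.bfloat16)

# Tolerances copied from the corr block of the benchmark artifacts.
torch.testing.assert_close(out, ref, rtol=1e-3, atol=0.03125)
print("absmax:", (out.float() - ref.float()).abs().max().item())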
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3873
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3874
  </span> |
3875
- Cell: benchmark | 6.10s
3876
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3878
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3943,19 +3943,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
- hf_kernels_layer_norm 4.63% 185.406us 46.16% 1.847ms 1.847ms 0.000us 0.00% 3.120ms 3.120ms 1
3947
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.69% 67.562us 40.98% 1.640ms 546.562us 2.384ms 100.00% 3.120ms 1.040ms 3
3948
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 100.06% 2.385ms 2.385ms 1
3949
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.384ms 100.00% 2.384ms 794.642us 3
3950
- Activity Buffer Request 36.92% 1.477ms 36.92% 1.477ms 1.477ms 735.676us 30.86% 735.676us 735.676us 1
3951
- aten::view 0.54% 21.751us 0.54% 21.751us 3.625us 0.000us 0.00% 0.000us 0.000us 6
3952
- aten::empty 1.11% 44.581us 1.11% 44.581us 4.953us 0.000us 0.00% 0.000us 0.000us 9
3953
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.23% 9.360us 0.23% 9.360us 3.120us 0.000us 0.00% 0.000us 0.000us 3
3954
- cudaLaunchKernel 1.03% 41.042us 1.03% 41.042us 13.681us 0.000us 0.00% 0.000us 0.000us 3
3955
- cudaDeviceSynchronize 53.84% 2.154ms 53.84% 2.154ms 2.154ms 0.000us 0.00% 0.000us 0.000us 1
3956
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3957
- Self CPU time total: 4.001ms
3958
- Self CUDA time total: 2.384ms
3959
 
3960
 
3961
 
@@ -3965,19 +3965,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
- hf_kernels_layer_norm 2.29% 145.447us 26.95% 1.711ms 1.711ms 0.000us 0.00% 6.386ms 6.386ms 1
3969
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 47.652us 24.47% 1.553ms 517.784us 4.812ms 100.00% 6.386ms 2.129ms 3
3970
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.814ms 100.03% 4.814ms 4.814ms 1
3971
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.812ms 100.00% 4.812ms 1.604ms 3
3972
- Activity Buffer Request 22.77% 1.446ms 22.77% 1.446ms 1.446ms 1.574ms 32.71% 1.574ms 1.574ms 1
3973
- aten::view 0.19% 11.759us 0.19% 11.759us 1.960us 0.000us 0.00% 0.000us 0.000us 6
3974
- aten::empty 0.46% 29.151us 0.46% 29.151us 3.239us 0.000us 0.00% 0.000us 0.000us 9
3975
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.860us 0.08% 4.860us 1.620us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaLaunchKernel 0.41% 26.131us 0.41% 26.131us 8.710us 0.000us 0.00% 0.000us 0.000us 3
3977
- cudaDeviceSynchronize 73.05% 4.638ms 73.05% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- Self CPU time total: 6.348ms
3980
- Self CUDA time total: 4.812ms
3981
 
3982
 
3983
 
@@ -3987,19 +3987,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- hf_kernels_layer_norm 2.00% 126.827us 27.00% 1.712ms 1.712ms 0.000us 0.00% 6.353ms 6.353ms 1
3991
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.76% 48.491us 24.80% 1.572ms 524.088us 4.792ms 100.00% 6.353ms 2.118ms 3
3992
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.793ms 100.03% 4.793ms 4.793ms 1
3993
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.792ms 100.00% 4.792ms 1.597ms 3
3994
- Activity Buffer Request 23.05% 1.462ms 23.05% 1.462ms 1.462ms 1.561ms 32.58% 1.561ms 1.561ms 1
3995
- aten::view 0.20% 12.869us 0.20% 12.869us 2.145us 0.000us 0.00% 0.000us 0.000us 6
3996
- aten::empty 0.48% 30.222us 0.48% 30.222us 3.358us 0.000us 0.00% 0.000us 0.000us 9
3997
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.090us 0.08% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaLaunchKernel 0.42% 26.901us 0.42% 26.901us 8.967us 0.000us 0.00% 0.000us 0.000us 3
3999
- cudaDeviceSynchronize 73.00% 4.628ms 73.00% 4.628ms 4.628ms 0.000us 0.00% 0.000us 0.000us 1
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- Self CPU time total: 6.340ms
4002
- Self CUDA time total: 4.792ms
4003
 
4004
 
4005
 
@@ -4009,37 +4009,36 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
- hf_kernels_layer_norm 1.24% 144.853us 19.15% 2.240ms 2.240ms 0.000us 0.00% 12.815ms 12.815ms 1
4013
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.39% 45.741us 17.80% 2.083ms 694.211us 9.628ms 100.00% 12.815ms 4.272ms 3
4014
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.01% 9.629ms 9.629ms 1
4015
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.00% 9.628ms 3.209ms 3
4016
- Activity Buffer Request 14.62% 1.710ms 14.62% 1.710ms 1.710ms 3.188ms 33.11% 3.188ms 3.188ms 1
4017
- aten::view 0.11% 12.972us 0.11% 12.972us 2.162us 0.000us 0.00% 0.000us 0.000us 6
4018
- aten::empty 0.26% 30.501us 0.26% 30.501us 3.389us 0.000us 0.00% 0.000us 0.000us 9
4019
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 5.220us 0.04% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaLaunchKernel 2.49% 291.291us 2.49% 291.291us 97.097us 0.000us 0.00% 0.000us 0.000us 3
4021
- cudaDeviceSynchronize 80.85% 9.456ms 80.85% 9.456ms 9.456ms 0.000us 0.00% 0.000us 0.000us 1
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
- Self CPU time total: 11.697ms
4024
- Self CUDA time total: 9.628ms
4025
 
4026
 
4027
  impl wl p50(ms) ok
4028
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4029
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
4030
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4031
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4032
  </pre></div>
4033
  <div class="uv-install-logs" id="uv-logs-benchmark">
4034
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4035
  <div class="uv-logs-content" style="display: none;">
4036
- Installed 15 packages in 14ms
4037
  </div>
4038
  </div>
4039
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4040
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 6.81it/s]
4041
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.12it/s]
4042
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.56it/s]</div>
4043
  <div class="cell-artifacts">
4044
  <h4>Artifacts:</h4>
4045
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
3872
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3873
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3874
  </span> |
3875
+ Cell: benchmark | 10.03s
3876
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3877
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3878
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
+ hf_kernels_layer_norm 5.27% 208.522us 46.60% 1.845ms 1.845ms 0.000us 0.00% 3.097ms 3.097ms 1
3947
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.74% 68.841us 40.71% 1.611ms 537.108us 2.361ms 100.00% 3.097ms 1.032ms 3
3948
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.06% 2.362ms 2.362ms 1
3949
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.361ms 100.00% 2.361ms 786.869us 3
3950
+ Activity Buffer Request 36.42% 1.442ms 36.42% 1.442ms 1.442ms 736.192us 31.19% 736.192us 736.192us 1
3951
+ aten::view 0.63% 24.853us 0.63% 24.853us 4.142us 0.000us 0.00% 0.000us 0.000us 6
3952
+ aten::empty 1.30% 51.300us 1.30% 51.300us 5.700us 0.000us 0.00% 0.000us 0.000us 9
3953
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 9.370us 0.24% 9.370us 3.123us 0.000us 0.00% 0.000us 0.000us 3
3954
+ cudaLaunchKernel 1.02% 40.192us 1.02% 40.192us 13.397us 0.000us 0.00% 0.000us 0.000us 3
3955
+ cudaDeviceSynchronize 53.40% 2.114ms 53.40% 2.114ms 2.114ms 0.000us 0.00% 0.000us 0.000us 1
3956
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3957
+ Self CPU time total: 3.958ms
3958
+ Self CUDA time total: 2.361ms
3959
 
3960
 
3961
 
 
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
+ hf_kernels_layer_norm 2.10% 132.443us 26.89% 1.698ms 1.698ms 0.000us 0.00% 6.359ms 6.359ms 1
3969
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 47.272us 24.61% 1.554ms 517.847us 4.798ms 100.00% 6.359ms 2.120ms 3
3970
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.799ms 100.03% 4.799ms 4.799ms 1
3971
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.798ms 100.00% 4.798ms 1.599ms 3
3972
+ Activity Buffer Request 22.80% 1.439ms 22.80% 1.439ms 1.439ms 1.561ms 32.53% 1.561ms 1.561ms 1
3973
+ aten::view 0.19% 11.750us 0.19% 11.750us 1.958us 0.000us 0.00% 0.000us 0.000us 6
3974
+ aten::empty 0.50% 31.791us 0.50% 31.791us 3.532us 0.000us 0.00% 0.000us 0.000us 9
3975
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.299us 0.08% 5.299us 1.766us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaLaunchKernel 0.47% 29.920us 0.47% 29.920us 9.973us 0.000us 0.00% 0.000us 0.000us 3
3977
+ cudaDeviceSynchronize 73.11% 4.615ms 73.11% 4.615ms 4.615ms 0.000us 0.00% 0.000us 0.000us 1
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ Self CPU time total: 6.313ms
3980
+ Self CUDA time total: 4.798ms
3981
 
3982
 
3983
 
 
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ hf_kernels_layer_norm 1.81% 113.352us 26.62% 1.665ms 1.665ms 0.000us 0.00% 6.298ms 6.298ms 1
3991
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.74% 46.460us 24.63% 1.540ms 513.314us 4.755ms 100.00% 6.298ms 2.099ms 3
3992
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.757ms 100.03% 4.757ms 4.757ms 1
3993
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.755ms 100.00% 4.755ms 1.585ms 3
3994
+ Activity Buffer Request 22.82% 1.427ms 22.82% 1.427ms 1.427ms 1.543ms 32.44% 1.543ms 1.543ms 1
3995
+ aten::view 0.19% 11.631us 0.19% 11.631us 1.939us 0.000us 0.00% 0.000us 0.000us 6
3996
+ aten::empty 0.51% 31.740us 0.51% 31.740us 3.527us 0.000us 0.00% 0.000us 0.000us 9
3997
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.941us 0.08% 4.941us 1.647us 0.000us 0.00% 0.000us 0.000us 3
3998
+ cudaLaunchKernel 0.48% 29.911us 0.48% 29.911us 9.970us 0.000us 0.00% 0.000us 0.000us 3
3999
+ cudaDeviceSynchronize 73.38% 4.589ms 73.38% 4.589ms 4.589ms 0.000us 0.00% 0.000us 0.000us 1
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ Self CPU time total: 6.253ms
4002
+ Self CUDA time total: 4.755ms
4003
 
4004
 
4005
 
 
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4011
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4012
+ hf_kernels_layer_norm 1.13% 113.823us 5.68% 571.343us 571.343us 0.000us 0.00% 12.836ms 12.836ms 1
4013
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.45% 45.540us 4.43% 445.300us 148.433us 9.651ms 100.00% 12.836ms 4.279ms 3
4014
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.652ms 100.01% 9.652ms 9.652ms 1
4015
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.651ms 100.00% 9.651ms 3.217ms 3
4016
+ Activity Buffer Request 1.18% 119.172us 1.18% 119.172us 119.172us 3.185ms 33.00% 3.185ms 3.185ms 1
4017
+ aten::view 0.12% 12.220us 0.12% 12.220us 2.037us 0.000us 0.00% 0.000us 0.000us 6
4018
+ aten::empty 0.31% 31.382us 0.31% 31.382us 3.487us 0.000us 0.00% 0.000us 0.000us 9
4019
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 4.801us 0.05% 4.801us 1.600us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaLaunchKernel 2.43% 244.405us 2.43% 244.405us 81.468us 0.000us 0.00% 0.000us 0.000us 3
4021
+ cudaDeviceSynchronize 94.32% 9.488ms 94.32% 9.488ms 9.488ms 0.000us 0.00% 0.000us 0.000us 1
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
+ Self CPU time total: 10.060ms
4024
+ Self CUDA time total: 9.651ms
4025
 
4026
 
4027
  impl wl p50(ms) ok
4028
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4029
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4030
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4031
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
4032
  </pre></div>
4033
  <div class="uv-install-logs" id="uv-logs-benchmark">
4034
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4035
  <div class="uv-logs-content" style="display: none;">
4036
+ Installed 52 packages in 214ms
4037
  </div>
4038
  </div>
4039
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4040
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.01it/s]
4041
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.02it/s]</div>
 
4042
  <div class="cell-artifacts">
4043
  <h4>Artifacts:</h4>
4044
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.23s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 00:36:39 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 33C P0 128W / 350W | 0MiB / 46068MiB | 100% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3918,9 +3918,9 @@ Cell: nv | 0.23s
3918
  <span class="collapse-indicators">
3919
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 7.38s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3968,19 +3968,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
3968
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3969
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
- torch_layer_norm 3.94% 153.226us 45.99% 1.787ms 1.787ms 0.000us 0.00% 3.036ms 3.036ms 1
3972
- aten::layer_norm 0.41% 15.819us 42.05% 1.634ms 544.665us 0.000us 0.00% 3.036ms 1.012ms 3
3973
- aten::native_layer_norm 2.10% 81.554us 41.64% 1.618ms 539.392us 2.323ms 100.00% 3.036ms 1.012ms 3
3974
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.325ms 100.06% 2.325ms 2.325ms 1
3975
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.323ms 100.00% 2.323ms 774.498us 3
3976
- Activity Buffer Request 36.88% 1.433ms 36.88% 1.433ms 1.433ms 712.322us 30.66% 712.322us 712.322us 1
3977
- aten::empty 1.28% 49.611us 1.28% 49.611us 5.512us 0.000us 0.00% 0.000us 0.000us 9
3978
- cudaLaunchKernel 1.19% 46.322us 1.19% 46.322us 15.441us 0.000us 0.00% 0.000us 0.000us 3
3979
- aten::view 0.19% 7.380us 0.19% 7.380us 1.230us 0.000us 0.00% 0.000us 0.000us 6
3980
- cudaDeviceSynchronize 54.01% 2.099ms 54.01% 2.099ms 2.099ms 0.000us 0.00% 0.000us 0.000us 1
3981
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3982
- Self CPU time total: 3.886ms
3983
- Self CUDA time total: 2.323ms
3984
 
3985
 
3986
 
@@ -3990,19 +3990,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 1.13% 72.543us 25.40% 1.627ms 1.627ms 0.000us 0.00% 6.533ms 6.533ms 1
- aten::layer_norm 0.14% 8.900us 24.27% 1.554ms 518.074us 0.000us 0.00% 6.533ms 2.178ms 3
- aten::native_layer_norm 0.84% 53.651us 24.13% 1.545ms 515.108us 4.915ms 100.00% 6.533ms 2.178ms 3
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.917ms 100.03% 4.917ms 4.917ms 1
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.915ms 100.00% 4.915ms 1.638ms 3
- Activity Buffer Request 22.32% 1.430ms 22.32% 1.430ms 1.430ms 1.618ms 32.92% 1.618ms 1.618ms 1
- aten::empty 0.44% 28.460us 0.44% 28.460us 3.162us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 0.46% 29.343us 0.46% 29.343us 9.781us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.07% 4.330us 0.07% 4.330us 0.722us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 74.60% 4.777ms 74.60% 4.777ms 4.777ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 6.403ms
- Self CUDA time total: 4.915ms


@@ -4012,19 +4012,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 1.16% 72.353us 26.06% 1.624ms 1.624ms 0.000us 0.00% 6.259ms 6.259ms 1
- aten::layer_norm 0.14% 8.650us 24.90% 1.551ms 517.051us 0.000us 0.00% 6.259ms 2.086ms 3
- aten::native_layer_norm 0.85% 52.692us 24.76% 1.543ms 514.168us 4.742ms 100.00% 6.259ms 2.086ms 3
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.744ms 100.03% 4.744ms 4.744ms 1
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.742ms 100.00% 4.742ms 1.581ms 3
- Activity Buffer Request 22.91% 1.427ms 22.91% 1.427ms 1.427ms 1.517ms 31.99% 1.517ms 1.517ms 1
- aten::empty 0.47% 29.452us 0.47% 29.452us 3.272us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 0.47% 29.331us 0.47% 29.331us 9.777us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.06% 4.009us 0.06% 4.009us 0.668us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 73.94% 4.606ms 73.94% 4.606ms 4.606ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 6.229ms
- Self CUDA time total: 4.742ms


@@ -4034,33 +4034,27 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- torch_layer_norm 0.67% 74.863us 13.13% 1.463ms 1.463ms 0.000us 0.00% 13.036ms 13.036ms 1
- aten::layer_norm 0.09% 9.640us 12.46% 1.388ms 462.622us 0.000us 0.00% 13.036ms 4.345ms 3
- aten::native_layer_norm 0.46% 51.640us 12.37% 1.378ms 459.409us 9.812ms 100.00% 13.036ms 4.345ms 3
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.814ms 100.01% 9.814ms 9.814ms 1
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.812ms 100.00% 9.812ms 3.271ms 3
- Activity Buffer Request 9.60% 1.069ms 9.60% 1.069ms 1.069ms 3.224ms 32.85% 3.224ms 3.224ms 1
- aten::empty 0.26% 29.363us 0.26% 29.363us 3.263us 0.000us 0.00% 0.000us 0.000us 9
- cudaLaunchKernel 2.01% 223.547us 2.01% 223.547us 74.516us 0.000us 0.00% 0.000us 0.000us 3
- aten::view 0.04% 4.180us 0.04% 4.180us 0.697us 0.000us 0.00% 0.000us 0.000us 6
- cudaDeviceSynchronize 86.87% 9.675ms 86.87% 9.675ms 9.675ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
- Self CPU time total: 11.138ms
- Self CUDA time total: 9.812ms

  impl wl p50(ms) ok
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
- torch_layer_norm LN_B16_S4096_D8192 3.32 True
  </pre></div>
- <div class="uv-install-logs" id="uv-logs-benchmark">
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
- <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 222ms
- </div>
- </div>
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: nv | 0.26s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
  </div>
  </div>
  <div id="output-nv" class="cell-output">
+ <div class="cell-stdout"><pre class="stdout-text">Wed Oct 29 04:14:31 2025
  +-----------------------------------------------------------------------------------------+
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
  |-----------------------------------------+------------------------+----------------------+

  | | | MIG M. |
  |=========================================+========================+======================|
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
+ | N/A 37C P0 140W / 350W | 0MiB / 46068MiB | 33% Default |
  | | | N/A |
  +-----------------------------------------+------------------------+----------------------+
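The nv cell only snapshots GPU state before the run. cells/nv.py itself is not shown in this hunk; a minimal sketch of such a cell, assuming it simply echoes the nvidia-smi output recorded above, is:

# Sketch of an nv-style cell (assumption: it just shells out to nvidia-smi;
# the real cells/nv.py may differ).
import subprocess

result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=True)
print(result.stdout)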
 
 
  <span class="collapse-indicators">
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
  </span> |
+ Cell: benchmark | 3.83s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 3.57% 138.883us 45.78% 1.780ms 1.780ms 0.000us 0.00% 3.022ms 3.022ms 1
+ aten::layer_norm 0.41% 16.121us 42.21% 1.641ms 546.912us 0.000us 0.00% 3.022ms 1.007ms 3
+ aten::native_layer_norm 2.00% 77.621us 41.80% 1.625ms 541.538us 2.315ms 100.00% 3.022ms 1.007ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.317ms 100.06% 2.317ms 2.317ms 1
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.315ms 100.00% 2.315ms 771.810us 3
+ Activity Buffer Request 37.36% 1.452ms 37.36% 1.452ms 1.452ms 706.306us 30.50% 706.306us 706.306us 1
+ aten::empty 1.15% 44.871us 1.15% 44.871us 4.986us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 1.10% 42.752us 1.10% 42.752us 14.251us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.19% 7.379us 0.19% 7.379us 1.230us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 54.22% 2.107ms 54.22% 2.107ms 2.107ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 3.887ms
+ Self CUDA time total: 2.315ms
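Trace tables like the one above come from torch.profiler; the actual harness lives in cells/benchmark.py, but a minimal sketch that reproduces a comparable table for the LN_B16_S2048_D4096 workload (batch 16, seqlen 2048, hidden 4096; bfloat16 is an assumption carried over from the other benches in this commit) is:

import torch
from torch.profiler import profile, ProfilerActivity

x = torch.randn(16, 2048, 4096, device="cuda", dtype=torch.bfloat16)
ln = torch.nn.LayerNorm(4096, device="cuda", dtype=torch.bfloat16)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for _ in range(3):        # the "# of Calls" column shows 3 aten::layer_norm calls
        y = ln(x)
    torch.cuda.synchronize()  # appears as cudaDeviceSynchronize in the trace

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))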
 


  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 1.48% 94.373us 25.56% 1.635ms 1.635ms 0.000us 0.00% 6.500ms 6.500ms 1
+ aten::layer_norm 0.15% 9.600us 24.08% 1.541ms 513.581us 0.000us 0.00% 6.500ms 2.167ms 3
+ aten::native_layer_norm 0.84% 53.630us 23.93% 1.531ms 510.381us 4.901ms 100.00% 6.500ms 2.167ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.903ms 100.03% 4.903ms 4.903ms 1
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.901ms 100.00% 4.901ms 1.634ms 3
+ Activity Buffer Request 22.17% 1.418ms 22.17% 1.418ms 1.418ms 1.599ms 32.62% 1.599ms 1.599ms 1
+ aten::empty 0.44% 28.023us 0.44% 28.023us 3.114us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 0.43% 27.290us 0.43% 27.290us 9.097us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.06% 3.930us 0.06% 3.930us 0.655us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 74.44% 4.763ms 74.44% 4.763ms 4.763ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 6.398ms
+ Self CUDA time total: 4.901ms

 
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 1.50% 93.544us 26.20% 1.631ms 1.631ms 0.000us 0.00% 6.249ms 6.249ms 1
+ aten::layer_norm 0.18% 11.099us 24.70% 1.537ms 512.487us 0.000us 0.00% 6.249ms 2.083ms 3
+ aten::native_layer_norm 0.84% 52.492us 24.52% 1.526ms 508.788us 4.730ms 100.00% 6.249ms 2.083ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.731ms 100.03% 4.731ms 4.731ms 1
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.730ms 100.00% 4.730ms 1.577ms 3
+ Activity Buffer Request 22.71% 1.414ms 22.71% 1.414ms 1.414ms 1.519ms 32.12% 1.519ms 1.519ms 1
+ aten::empty 0.45% 28.140us 0.45% 28.140us 3.127us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 0.45% 28.230us 0.45% 28.230us 9.410us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.06% 3.950us 0.06% 3.950us 0.658us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 73.80% 4.594ms 73.80% 4.594ms 4.594ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 6.225ms
+ Self CUDA time total: 4.730ms

  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_layer_norm 0.92% 106.312us 16.40% 1.902ms 1.902ms 0.000us 0.00% 13.074ms 13.074ms 1
+ aten::layer_norm 0.08% 9.291us 15.49% 1.795ms 598.420us 0.000us 0.00% 13.074ms 4.358ms 3
+ aten::native_layer_norm 0.48% 55.080us 15.41% 1.786ms 595.323us 9.836ms 100.00% 13.074ms 4.358ms 3
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.838ms 100.01% 9.838ms 9.838ms 1
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.836ms 100.00% 9.836ms 3.279ms 3
+ Activity Buffer Request 12.48% 1.446ms 12.48% 1.446ms 1.446ms 3.238ms 32.91% 3.238ms 3.238ms 1
+ aten::empty 0.25% 29.330us 0.25% 29.330us 3.259us 0.000us 0.00% 0.000us 0.000us 9
+ cudaLaunchKernel 2.17% 251.116us 2.17% 251.116us 83.705us 0.000us 0.00% 0.000us 0.000us 3
+ aten::view 0.03% 3.981us 0.03% 3.981us 0.663us 0.000us 0.00% 0.000us 0.000us 6
+ cudaDeviceSynchronize 83.60% 9.692ms 83.60% 9.692ms 9.692ms 0.000us 0.00% 0.000us 0.000us 1
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Self CPU time total: 11.593ms
+ Self CUDA time total: 9.836ms

  impl wl p50(ms) ok
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
+ torch_layer_norm LN_B16_S4096_D8192 3.33 True
  </pre></div>
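The p50 column in this summary is read back from the JSONL artifact linked below. Assuming the same record schema as the other *.jsonl files in this commit (impl, wl.name, lat_ms.p50, ok), the table can be regenerated with:

import json

with open("artifacts/benchmark/layer_norm.jsonl") as f:
    for rec in map(json.loads, f):
        # one record per workload; p50 is in milliseconds
        print(f"{rec['impl']:>24} {rec['wl']['name']:>20} "
              f"{rec['lat_ms']['p50']:>8.2f} {rec['ok']}")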
 
 
 
 
 
 
  <div class="cell-artifacts">
  <h4>Artifacts:</h4>
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/results/artifacts/combine/latency.svg CHANGED
layer_norm/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-29T00:37:29.280510</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -3956,70 +3956,70 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
@@ -4027,27 +4027,27 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  [embedded latency plot, old coordinates removed: y-axis gridlines with tick labels 1.0-3.0 and data-point paths for the series torch-layer-norm and hf-kernels-layer-norm]
@@ -4105,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
- Cell: combine | 4.26s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4193,13 +4193,13 @@ COMBINED BENCHMARK SUMMARY

  impl wl p50(ms) ok
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
- hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
- torch_layer_norm LN_B16_S4096_D8192 3.32 True

  GENERATING COMBINED VISUALIZATION

@@ -4219,7 +4219,7 @@ Implementations included:
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 195ms
  </div>
  </div>
  <div class="cell-artifacts">
@@ -4232,7 +4232,7 @@ Installed 37 packages in 195ms
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-29T00:37:29.280510</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -4316,70 +4316,70 @@ Installed 37 packages in 195ms
@@ -4387,27 +4387,27 @@ Installed 37 packages in 195ms
  [second embedded copy of the same latency plot, old coordinates removed: y-axis gridlines with tick labels 1.0-3.0 and data-point paths for the series torch-layer-norm and hf-kernels-layer-norm]
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T04:14:58.377658</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
 
  [embedded latency plot, new coordinates added: y-axis gridlines with tick labels 1.0-3.0 and updated data-point paths for the series torch-layer-norm and hf-kernels-layer-norm]
 
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.24s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 

  impl wl p50(ms) ok
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
  hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
  torch_layer_norm LN_B16_S2048_D4096 0.82 True
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
+ torch_layer_norm LN_B16_S4096_D8192 3.33 True

  GENERATING COMBINED VISUALIZATION
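The combine cell merges the per-implementation JSONL results and renders latency.svg with matplotlib (matplotlib is confirmed by the SVG metadata; the exact logic lives in cells/combine.py). A minimal sketch of that step, with hypothetical input filenames, is:

import json
from collections import defaultdict
import matplotlib.pyplot as plt

series = defaultdict(list)  # impl -> [(workload name, p50 ms), ...]
for path in ["torch_layer_norm.jsonl", "hf_kernels_layer_norm.jsonl"]:  # hypothetical names
    with open(path) as f:
        for rec in map(json.loads, f):
            series[rec["impl"]].append((rec["wl"]["name"], rec["lat_ms"]["p50"]))

for impl, points in series.items():
    names, p50s = zip(*points)
    plt.plot(names, p50s, marker="o", label=impl)
plt.ylabel("p50 latency (ms)")
plt.legend()
plt.savefig("latency.svg")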
 
 
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 227ms
  </div>
  </div>
  <div class="cell-artifacts">
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T04:14:58.377658</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
 
  [second embedded copy of the same latency plot, new coordinates added: y-axis gridlines with tick labels 1.0-3.0 and updated data-point paths for the series torch-layer-norm and hf-kernels-layer-norm]
rotary/impls/artifacts/benchmark/rotary.jsonl CHANGED
@@ -1,24 +1,24 @@
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07538300002352116, "p50": 0.07777199999736695, "p90": 0.07795200002647107, "mean": 0.07717860000866494, "iqr": 0.0014790000477660215, "raw_times": [0.07777199999736695, 0.07647299997870505, 0.07795200002647107, 0.07831300001726049, 0.07538300002352116], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0837029999729566, "peak_bytes": 1720320, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00153350830078125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09504299998752685, "p50": 0.09633299998768052, "p90": 0.09746300003143915, "mean": 0.0966769999877215, "iqr": 0.0013000000649299182, "raw_times": [0.09504299998752685, 0.09633299998768052, 0.09838299996545175, 0.09616299996650923, 0.09746300003143915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09918300003164404, "peak_bytes": 3440640, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.00154876708984375, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0929430000269349, "p50": 0.09560399996644264, "p90": 0.09620299999824056, "mean": 0.09600920000139013, "iqr": 0.0026899999738816405, "raw_times": [0.09620299999824056, 0.09560399996644264, 0.10178299999097362, 0.09351300002435892, 0.0929430000269349], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10062299998025992, "peak_bytes": 6832128, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09350300001642609, "p50": 0.09415400000989393, "p90": 0.09585299994796515, "mean": 0.09842139999136634, "iqr": 0.001959999963219161, "raw_times": [0.09350300001642609, 0.09585299994796515, 0.09415400000989393, 0.11470399999780057, 0.09389299998474598], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09742299999970783, "peak_bytes": 13664256, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248300000308518, "p50": 0.09347299999262759, "p90": 0.09500300001263895, "mean": 0.09405499998820233, "iqr": 0.0018000000636675395, "raw_times": [0.09248300000308518, 0.09500300001263895, 0.0961129999836885, 0.09347299999262759, 0.09320299994897141], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855400003289105, "peak_bytes": 6881280, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09233299999777955, "p50": 0.09477300000071409, "p90": 0.09477400004698211, "mean": 0.09424540002100912, "iqr": 0.0021910000214120373, "raw_times": [0.09233299999777955, 0.09477400004698211, 0.09477300000071409, 0.09676400003399976, 0.09258300002557007], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09677399998508918, "peak_bytes": 13762560, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216400002287628, "p50": 0.09306300000844203, "p90": 0.09349300000849325, "mean": 0.09324520001428027, "iqr": 0.0005399999736255268, "raw_times": [0.09216400002287628, 0.09306300000844203, 0.09455299999672206, 0.09349300000849325, 0.09295300003486773], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10914400002093316, "peak_bytes": 27328512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5854835510253906e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09248299994624176, "p50": 0.09334300000318763, "p90": 0.09355399998867142, "mean": 0.0935691999870869, "iqr": 0.00066100000140068, "raw_times": [0.09355399998867142, 0.09557300001006297, 0.09334300000318763, 0.09248299994624176, 0.09289299998727074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 54657024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.00154876708984375, "mse_q": 1.621246337890625e-05, "mse_k": 1.621246337890625e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09247299999515235, "p50": 0.09385300000985808, "p90": 0.09445400002050519, "mean": 0.09405140001490508, "iqr": 0.001121000025250396, "raw_times": [0.09247299999515235, 0.09445400002050519, 0.0933329999952548, 0.09385300000985808, 0.09614400005375501], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09844400000247333, "peak_bytes": 27525120, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09372300002041811, "p50": 0.094173000036335, "p90": 0.09575299998232367, "mean": 0.09506720000445057, "iqr": 0.0020299999619055598, "raw_times": [0.09796399996275795, 0.09575299998232367, 0.094173000036335, 0.09372300002041811, 0.09372300002041811], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09865399999853253, "peak_bytes": 55050240, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09140299999899071, "p50": 0.092913999992561, "p90": 0.09422299996231231, "mean": 0.09330119999049202, "iqr": 0.0015199999552351073, "raw_times": [0.09140299999899071, 0.09526299999151888, 0.092913999992561, 0.09422299996231231, 0.09270300000707721], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09514300001001175, "peak_bytes": 109314048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09479400000600435, "p50": 0.09623299996519563, "p90": 0.09679300001153024, "mean": 0.09610519999796452, "iqr": 0.000919999990856013, "raw_times": [0.09587300002067423, 0.09679300001153024, 0.09479400000600435, 0.09623299996519563, 0.09683299998641814], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09740300004068558, "peak_bytes": 218628096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09216300003345168, "p50": 0.09397300004820863, "p90": 0.09462299999540846, "mean": 0.09381320001011773, "iqr": 0.0016889999869817984, "raw_times": [0.09293400000842666, 0.09537299996509319, 0.09397300004820863, 0.09216300003345168, 0.09462299999540846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10023300001194002, "peak_bytes": 68698112, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0913630000241028, "p50": 0.0930929999753971, "p90": 0.09448299999803567, "mean": 0.09361499999158696, "iqr": 0.0023500000452258973, "raw_times": [0.0913630000241028, 0.09700300000758943, 0.09213299995280977, 0.09448299999803567, 0.0930929999753971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09703300003138793, "peak_bytes": 6848512, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0902330000371876, "p50": 0.09208300002683245, "p90": 0.0927039999965018, "mean": 0.0920254000220666, "iqr": 0.0007599999776175537, "raw_times": [0.0902330000371876, 0.09194400001888425, 0.09208300002683245, 0.09316300003092692, 0.0927039999965018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09501400000999638, "peak_bytes": 13647872, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09339300004285178, "p50": 0.09388300003365657, "p90": 0.09438299997555077, "mean": 0.09392300001991316, "iqr": 0.0009499999578110874, "raw_times": [0.09388300003365657, 0.09452300002976699, 0.09438299997555077, 0.09339300004285178, 0.09343300001773969], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09746399996402033, "peak_bytes": 27295744, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.621246337890625e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09369299999661962, "p50": 0.09495300002981821, "p90": 0.09641299999429975, "mean": 0.09557120000636132, "iqr": 0.001839999981712026, "raw_times": [0.09457300001258773, 0.09495300002981821, 0.09641299999429975, 0.09369299999661962, 0.0982239999984813], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09584299999687573, "peak_bytes": 13697024, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.0015411376953125, "mse_q": 1.5974044799804688e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09207300001889962, "p50": 0.09441299999934927, "p90": 0.09493300001395255, "mean": 0.09826719999637135, "iqr": 0.0009000000318337698, "raw_times": [0.09207300001889962, 0.11588399996753651, 0.09441299999934927, 0.09493300001395255, 0.09403299998211878], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09803300002886317, "peak_bytes": 27394048, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00153350830078125, "mae_k": 0.00153350830078125, "mse_q": 1.5854835510253906e-05, "mse_k": 1.5974044799804688e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09320300000581483, "p50": 0.09509299997034759, "p90": 0.0968430000511944, "mean": 0.0957752000090295, "iqr": 0.0027100000465907215, "raw_times": [0.0968430000511944, 0.09413300000460367, 0.09509299997034759, 0.09960400001318703, 0.09320300000581483], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09855399997604763, "peak_bytes": 54591488, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0926630000321893, "p50": 0.09438299997555077, "p90": 0.09443299995837151, "mean": 0.09837319998950989, "iqr": 0.0016799999684735667, "raw_times": [0.09275299998989794, 0.09438299997555077, 0.09443299995837151, 0.0926630000321893, 0.1176339999915399], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09754300003805838, "peak_bytes": 109182976, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09100299996589456, "p50": 0.09359299997413473, "p90": 0.09518299998489965, "mean": 0.09356119999210932, "iqr": 0.0025699999355310865, "raw_times": [0.09100299996589456, 0.09518299998489965, 0.09261300004936857, 0.09541399998624911, 0.09359299997413473], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.11789399997041983, "peak_bytes": 54788096, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09348399998998502, "p50": 0.09433299999273004, "p90": 0.09580299996514441, "mean": 0.09473540000044522, "iqr": 0.0016299999288094114, "raw_times": [0.09433299999273004, 0.09580299996514441, 0.09588400001803166, 0.09348399998998502, 0.094173000036335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09657300000753821, "peak_bytes": 109576192, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.00154876708984375, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0974529999666629, "p50": 0.09860399995886837, "p90": 0.09875400002101742, "mean": 0.09851759998582565, "iqr": 0.0008510000384376326, "raw_times": [0.09790299998257979, 0.0974529999666629, 0.09860399995886837, 0.09875400002101742, 0.0998739999999998], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10540400000991212, "peak_bytes": 218365952, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.0625, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
- {"ts": "2025-10-29T00:36:59Z", "run": "2046381e300e4ff697cc6e6e12460492", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2809499999898435, "p50": 0.28135000002293964, "p90": 0.2840199999809556, "mean": 0.28239179999900443, "iqr": 0.0029809999659846653, "raw_times": [0.2809499999898435, 0.28459999998631247, 0.2840199999809556, 0.28103900001497095, 0.28135000002293964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.28416999998626125, "peak_bytes": 436731904, "ok": false, "absmax": null, "corr": {"ok": false, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0625, "absmax_k": 0.125, "mae_q": 0.0015411376953125, "mae_k": 0.0015411376953125, "mse_q": 1.609325408935547e-05, "mse_k": 1.609325408935547e-05, "ref": "rotary_torch"}, "err": null}
 
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.17578399996409644, "p50": 0.17709399998011577, "p90": 0.17922400002134964, "mean": 0.17895179998959065, "iqr": 0.002560000041285093, "raw_times": [0.17666399998006455, 0.17709399998011577, 0.17578399996409644, 0.18599300000232688, 0.17922400002134964], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.18588400001817718, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21852399999033878, "p50": 0.22123499996951068, "p90": 0.22281499997234278, "mean": 0.22667299998602175, "iqr": 0.0019999999949504854, "raw_times": [0.2208149999773923, 0.22123499996951068, 0.21852399999033878, 0.24997600002052422, 0.22281499997234278], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22185399996033084, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21726399995714019, "p50": 0.22378500000286294, "p90": 0.22635499999523745, "mean": 0.22464679999529835, "iqr": 0.0036000000136482413, "raw_times": [0.22635499999523745, 0.22378500000286294, 0.21726399995714019, 0.23307500003966197, 0.2227549999815892], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22023499997203544, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21678499996369283, "p50": 0.2199049999944691, "p90": 0.22050500001569162, "mean": 0.21960279999575505, "iqr": 0.0022199999989425123, "raw_times": [0.22050500001569162, 0.2199049999944691, 0.2182850000167491, 0.22253399998817258, 0.21678499996369283], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23155500002758345, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21946399999706045, "p50": 0.22019400000772293, "p90": 0.22058499996546743, "mean": 0.2201885999852493, "iqr": 0.0005499999815583578, "raw_times": [0.22019400000772293, 0.22003499998390907, 0.22058499996546743, 0.21946399999706045, 0.22066499997208666], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22136500001579407, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.215785000023061, "p50": 0.21925499999042586, "p90": 0.22044500002493805, "mean": 0.22373300000708696, "iqr": 0.003450000008342613, "raw_times": [0.215785000023061, 0.22044500002493805, 0.21925499999042586, 0.24618499998041443, 0.21699500001659544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22207500001059088, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21516500004281625, "p50": 0.2195149999693058, "p90": 0.22164500001053966, "mean": 0.21926680001342902, "iqr": 0.005660999988776894, "raw_times": [0.22402500002272063, 0.2195149999693058, 0.21516500004281625, 0.21598400002176277, 0.22164500001053966], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21819500000219705, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2141339999752745, "p50": 0.218735000032666, "p90": 0.21932399999968766, "mean": 0.22093040000754627, "iqr": 0.0017599999750927964, "raw_times": [0.2141339999752745, 0.23489500000550834, 0.21756400002459486, 0.218735000032666, 0.21932399999968766], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22086500001705645, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21650500002579065, "p50": 0.21868400000357724, "p90": 0.21925499999042586, "mean": 0.22372079999968264, "iqr": 0.0009299999987888441, "raw_times": [0.21650500002579065, 0.24583499998698244, 0.21925499999042586, 0.21832499999163701, 0.21868400000357724], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22242500000402288, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2184950000128083, "p50": 0.22323500002130459, "p90": 0.22841400004836032, "mean": 0.22448680001616594, "iqr": 0.008849000039390376, "raw_times": [0.22841400004836032, 0.22323500002130459, 0.2184950000128083, 0.21956500000896995, 0.23272499998938656], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22242500000402288, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21709500003908033, "p50": 0.22114500001180204, "p90": 0.22174499997618113, "mean": 0.22064500001306442, "iqr": 0.004549999971459329, "raw_times": [0.22174499997618113, 0.22114500001180204, 0.22604500003353678, 0.21709500003908033, 0.2171950000047218], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2633260000379778, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2232749999961925, "p50": 0.22420499999498134, "p90": 0.225494999995135, "mean": 0.22680499999978565, "iqr": 0.001499999996212864, "raw_times": [0.23705500001369728, 0.22399499999892214, 0.225494999995135, 0.22420499999498134, 0.2232749999961925], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22469399999636153, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:38Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21559499998602405, "p50": 0.2172250000285203, "p90": 0.21947499999441789, "mean": 0.2188926000030733, "iqr": 0.002500999983112706, "raw_times": [0.21559499998602405, 0.22519399999509915, 0.21697400001130518, 0.21947499999441789, 0.2172250000285203], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2516950000313045, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21588499998870248, "p50": 0.2187050000088675, "p90": 0.2197649999970963, "mean": 0.22497519998978532, "iqr": 0.0018900000213761814, "raw_times": [0.21787499997572013, 0.2526459999785402, 0.21588499998870248, 0.2187050000088675, 0.2197649999970963], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22343400002000635, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21829500002468194, "p50": 0.2230250000252454, "p90": 0.2236250000464679, "mean": 0.22705700001779405, "iqr": 0.0044400000547284435, "raw_times": [0.21829500002468194, 0.25115500000083557, 0.21918499999173946, 0.2230250000252454, 0.2236250000464679], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22709500001383276, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2149849999568687, "p50": 0.21960500004070127, "p90": 0.22131500003297333, "mean": 0.22512300001835683, "iqr": 0.0024400000029345392, "raw_times": [0.21960500004070127, 0.22131500003297333, 0.2149849999568687, 0.2188750000300388, 0.25083500003120207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22256500000139567, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21324499999764157, "p50": 0.21695399999543952, "p90": 0.22048499999982596, "mean": 0.21762459998626582, "iqr": 0.003631000026871334, "raw_times": [0.21685399997295463, 0.22058499996546743, 0.21695399999543952, 0.22048499999982596, 0.21324499999764157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2328750000515356, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21507499997142077, "p50": 0.21520500001770415, "p90": 0.21626500000593296, "mean": 0.2183889999969324, "iqr": 0.001121000025250396, "raw_times": [0.21626500000593296, 0.21507499997142077, 0.21520500001770415, 0.23025600000892155, 0.21514399998068257], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22121499995364502, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2143149999938032, "p50": 0.21809500003655558, "p90": 0.21866499997713618, "mean": 0.2174867999997332, "iqr": 0.0025609999738662736, "raw_times": [0.2161040000032699, 0.2202549999879011, 0.21866499997713618, 0.2143149999938032, 0.21809500003655558], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23317500000530345, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21576399996092732, "p50": 0.21857400003000294, "p90": 0.2222250000158965, "mean": 0.22089439999035676, "iqr": 0.004881000052137097, "raw_times": [0.21576399996092732, 0.23056499998119762, 0.21857400003000294, 0.2222250000158965, 0.21734399996375942], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22117399998933251, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.213884999993752, "p50": 0.21784400001934046, "p90": 0.21903500004327725, "mean": 0.2176128000087374, "iqr": 0.001270000041131425, "raw_times": [0.213884999993752, 0.21784400001934046, 0.21776500000214583, 0.21953499998517145, 0.21903500004327725], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22340499998563246, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21709399999281231, "p50": 0.2186049999863826, "p90": 0.21865499996920335, "mean": 0.22543699998323063, "iqr": 0.0004899999908047903, "raw_times": [0.21816499997839855, 0.25466599998935635, 0.21709399999281231, 0.2186049999863826, 0.21865499996920335], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22284399994987325, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2291549999995368, "p50": 0.23022499999569845, "p90": 0.2316450000421355, "mean": 0.234377000015229, "iqr": 0.0017800000478018774, "raw_times": [0.2291549999995368, 0.2316450000421355, 0.23022499999569845, 0.25099500004444053, 0.22986499999433363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23142500003814348, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
+ {"ts": "2025-10-29T04:14:39Z", "run": "947975ffa01949789f79567552915539", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6352529999844592, "p50": 0.6405139999969833, "p90": 0.6429430000025604, "mean": 0.6394775999979174, "iqr": 0.007369000002199755, "raw_times": [0.6405139999969833, 0.6352529999844592, 0.6431040000052235, 0.6355740000003607, 0.6429430000025604], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.6388340000285098, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
rotary/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
 # "numpy",
 # "torch==2.8.0",
 # "kernels-benchmark-tools",
- # "kernels",
 # ]
 #
 # [tool.uv.sources]
@@ -13,35 +12,46 @@
 import torch
 import sys
 from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
- from kernels import get_kernel

- # Load the rotary kernel
- rotary = get_kernel("kernels-community/rotary")

+ def apply_rotary_torch(x1, x2, cos, sin, conj=False):
+     """Reference rotary implementation."""
+     if not conj:
+         out1 = x1 * cos - x2 * sin
+         out2 = x1 * sin + x2 * cos
+     else:
+         out1 = x1 * cos + x2 * sin
+         out2 = -x1 * sin + x2 * cos
+     return out1, out2

- def hf_kernels_rotary(query, key, cos, sin, conj=False):
+ def torch_rotary(query, key, cos, sin, conj=False):
      rotary_dim = cos.shape[-1]

-     # Clone to avoid modifying inputs
+     # Clone inputs to avoid modifying them
      q_out = query.clone()
      k_out = key.clone()

      # Apply rotation to query
      q1 = q_out[..., :rotary_dim]
      q2 = q_out[..., rotary_dim : 2 * rotary_dim]
-     rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+     q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
+     q_out[..., :rotary_dim] = q_out_1
+     q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2

      # Apply rotation to key
      k1 = k_out[..., :rotary_dim]
      k2 = k_out[..., rotary_dim : 2 * rotary_dim]
-     rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+     k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
+     k_out[..., :rotary_dim] = k_out_1
+     k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2

      return q_out, k_out


 run_benchmark(
     kernel_type=KernelTypeEnum.ROTARY,
-     impl_name="hf_kernels_rotary",
-     impl_tags={"family": "hf-kernels", "backend": "cuda"},
-     impl_func=hf_kernels_rotary,
+     impl_name="torch_eager",
+     impl_tags={"family": "pytorch", "backend": "eager"},
+     impl_func=torch_rotary,
 )
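The change swaps the `kernels-community/rotary` CUDA kernel for a pure-PyTorch eager reference. The math is a plain 2-D rotation applied pairwise: each `(x1, x2)` pair is rotated by the angle whose cosine and sine are supplied, which is the complex product `(x1 + i*x2) * e^(i*theta)` (or `e^(-i*theta)` when `conj=True`). A standalone sanity check of that identity, separate from the benchmark cell:

```python
# Standalone sketch (not part of the cell above): confirm that the formulas in
# apply_rotary_torch are rotation via complex multiplication.
import torch

theta = torch.rand(16, 32) * 6.28            # one angle per (position, pair)
cos, sin = torch.cos(theta), torch.sin(theta)
x1, x2 = torch.randn(16, 32), torch.randn(16, 32)

out1 = x1 * cos - x2 * sin                   # same algebra as apply_rotary_torch
out2 = x1 * sin + x2 * cos

z = (x1 + 1j * x2) * torch.exp(1j * theta)   # (x1 + i*x2) * e^{i*theta}
assert torch.allclose(out1, z.real, atol=1e-5)
assert torch.allclose(out2, z.imag, atol=1e-5)
```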
rotary/impls/hf_kernels_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/impls/torch_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/results/artifacts/combine/latency.svg CHANGED
rotary/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ / @@ -4216,70 +4216,70 @@ / @@ -4287,34 +4287,34 @@
[Inline-SVG diff elided: the embedded matplotlib latency plot was regenerated. The changed lines are the <dc:date> metadata timestamp (was 2025-10-29T00:37:24.930217), the y-axis gridlines and tick labels (0.2, 0.3, 0.4, 0.5, 0.6), and the "torch-eager" series polyline and circular markers, all shifted to new plot coordinates.]
@@ -4364,7 +4364,7 @@
 <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
 <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
- Cell: combine | 4.37s
 | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
 <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4451,7 +4451,7 @@ Summary: 2 found, 0 skipped, 0 missing
 COMBINED BENCHMARK SUMMARY

 impl wl p50(ms) ok
- hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.09 False
 hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.10 False
 hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
 hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False
@@ -4462,7 +4462,7 @@ hf_kernels_rotary cuda_B1_S2048_H8_D64_R32 0.09 False
 hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False
 hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False
 hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False
- hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.09 False
 hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False
 hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False
 hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False
@@ -4470,35 +4470,35 @@ hf_kernels_rotary cuda_B2_S128_H8_D64_R32 0.09 False
 hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False
 hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False
 hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False
- hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.09 False
- hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.09 False
- hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.10 False
 hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False
 hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
- torch_eager cuda_B1_S128_H32_D128_R64 0.23 True
- torch_eager cuda_B1_S128_H32_D64_R32 0.23 True
- torch_eager cuda_B1_S128_H8_D128_R64 0.23 True
 torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
- torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True
- torch_eager cuda_B1_S2048_H32_D64_R32 0.23 True
- torch_eager cuda_B1_S2048_H8_D128_R64 0.23 True
- torch_eager cuda_B1_S2048_H8_D64_R32 0.23 True
- torch_eager cuda_B1_S512_H32_D128_R64 0.23 True
- torch_eager cuda_B1_S512_H32_D64_R32 0.23 True
- torch_eager cuda_B1_S512_H8_D128_R64 0.23 True
- torch_eager cuda_B1_S512_H8_D64_R32 0.23 True
- torch_eager cuda_B2_S128_H32_D128_R64 0.23 True
- torch_eager cuda_B2_S128_H32_D64_R32 0.23 True
- torch_eager cuda_B2_S128_H8_D128_R64 0.23 True
- torch_eager cuda_B2_S128_H8_D64_R32 0.23 True
 torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
 torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
- torch_eager cuda_B2_S2048_H8_D128_R64 0.23 True
- torch_eager cuda_B2_S2048_H8_D64_R32 0.23 True
- torch_eager cuda_B2_S512_H32_D128_R64 0.23 True
- torch_eager cuda_B2_S512_H32_D64_R32 0.23 True
- torch_eager cuda_B2_S512_H8_D128_R64 0.23 True
- torch_eager cuda_B2_S512_H8_D64_R32 0.23 True

 GENERATING COMBINED VISUALIZATION

@@ -4518,7 +4518,7 @@ Implementations included:
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
- Installed 37 packages in 224ms
  </div>
  </div>
  <div class="cell-artifacts">
@@ -4531,7 +4531,7 @@ Installed 37 packages in 224ms
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:date>2025-10-29T00:37:24.930217</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
@@ -4875,70 +4875,70 @@ Installed 37 packages in 224ms
- [SVG diff: five y-axis gridlines, tick marks, and tick labels (0.2 to 0.6) removed at their old positions]
  <g id="label--y" class="ylabel">
@@ -4946,34 +4946,34 @@ Installed 37 packages in 224ms
  </g>
  </g>
  <g id="series--torch-eager" class="series">
- [SVG diff: torch-eager series path and 22 interior point markers (x 113.62 to 757.25) removed at their old y coordinates; the first and last markers are unchanged]
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
  </g>
  </g>
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T04:15:02.721683</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
+ [SVG diff: five y-axis gridlines, tick marks, and tick labels (0.2 to 0.6) added at their new positions]
  <g id="label--y" class="ylabel">
 
  </g>
  </g>
  <g id="series--torch-eager" class="series">
+ [SVG diff: torch-eager series path and 22 interior point markers (x 113.62 to 757.25) added at their new y coordinates; the first and last markers are unchanged]
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
  </g>
  </g>
 
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
  </span> |
+ Cell: combine | 4.35s
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
  COMBINED BENCHMARK SUMMARY

  impl wl p50(ms) ok
+ hf_kernels_rotary cuda_B1_S128_H32_D128_R64 0.10 False
  hf_kernels_rotary cuda_B1_S128_H32_D64_R32 0.10 False
  hf_kernels_rotary cuda_B1_S128_H8_D128_R64 0.10 False
  hf_kernels_rotary cuda_B1_S128_H8_D64_R32 0.08 False

  hf_kernels_rotary cuda_B1_S512_H32_D128_R64 0.09 False
  hf_kernels_rotary cuda_B1_S512_H32_D64_R32 0.09 False
  hf_kernels_rotary cuda_B1_S512_H8_D128_R64 0.09 False
+ hf_kernels_rotary cuda_B1_S512_H8_D64_R32 0.10 False
  hf_kernels_rotary cuda_B2_S128_H32_D128_R64 0.09 False
  hf_kernels_rotary cuda_B2_S128_H32_D64_R32 0.09 False
  hf_kernels_rotary cuda_B2_S128_H8_D128_R64 0.09 False

  hf_kernels_rotary cuda_B2_S2048_H32_D128_R64 0.28 False
  hf_kernels_rotary cuda_B2_S2048_H32_D64_R32 0.10 False
  hf_kernels_rotary cuda_B2_S2048_H8_D128_R64 0.09 False
+ hf_kernels_rotary cuda_B2_S2048_H8_D64_R32 0.10 False
+ hf_kernels_rotary cuda_B2_S512_H32_D128_R64 0.10 False
+ hf_kernels_rotary cuda_B2_S512_H32_D64_R32 0.09 False
  hf_kernels_rotary cuda_B2_S512_H8_D128_R64 0.09 False
  hf_kernels_rotary cuda_B2_S512_H8_D64_R32 0.09 False
+ torch_eager cuda_B1_S128_H32_D128_R64 0.22 True
+ torch_eager cuda_B1_S128_H32_D64_R32 0.22 True
+ torch_eager cuda_B1_S128_H8_D128_R64 0.22 True
  torch_eager cuda_B1_S128_H8_D64_R32 0.18 True
+ torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True
+ torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True
+ torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True
+ torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True
+ torch_eager cuda_B1_S512_H32_D128_R64 0.22 True
+ torch_eager cuda_B1_S512_H32_D64_R32 0.22 True
+ torch_eager cuda_B1_S512_H8_D128_R64 0.22 True
+ torch_eager cuda_B1_S512_H8_D64_R32 0.22 True
+ torch_eager cuda_B2_S128_H32_D128_R64 0.22 True
+ torch_eager cuda_B2_S128_H32_D64_R32 0.22 True
+ torch_eager cuda_B2_S128_H8_D128_R64 0.22 True
+ torch_eager cuda_B2_S128_H8_D64_R32 0.22 True
  torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True
  torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True
+ torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True
+ torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True
+ torch_eager cuda_B2_S512_H32_D128_R64 0.22 True
+ torch_eager cuda_B2_S512_H32_D64_R32 0.22 True
+ torch_eager cuda_B2_S512_H8_D128_R64 0.22 True
+ torch_eager cuda_B2_S512_H8_D64_R32 0.22 True

  GENERATING COMBINED VISUALIZATION
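[Editor's note: the combined summary above aggregates per-implementation JSONL records into one impl/workload/p50/ok table. A minimal sketch of such an aggregation, assuming records shaped like the benchmark JSONL artifacts in this repo (with "impl", "wl.name", "lat_ms.p50", and "ok" keys); the script name and glob path are hypothetical, not the actual combine cell:]

```python
# combine_summary.py -- illustrative sketch, not the repo's combine cell.
# Assumes each impl wrote JSONL records carrying "impl", "wl" (with "name"),
# "lat_ms" (with "p50"), and "ok" fields, as in the benchmark artifacts.
import json
from pathlib import Path

rows = []
for path in sorted(Path(".").glob("*/impls/artifacts/benchmark/*.jsonl")):
    for line in path.read_text().splitlines():
        if not line.strip():
            continue  # skip blank lines between records
        rec = json.loads(line)
        rows.append((rec["impl"], rec["wl"]["name"], rec["lat_ms"]["p50"], rec["ok"]))

rows.sort(key=lambda r: (r[0], r[1]))  # group by impl, then by workload name
print(f"{'impl':<20} {'wl':<30} {'p50(ms)':>8} {'ok':>5}")
for impl, wl, p50, ok in rows:
    print(f"{impl:<20} {wl:<30} {p50:>8.2f} {ok!s:>5}")
```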
 
 
  <div class="uv-install-logs" id="uv-logs-combine">
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
  <div class="uv-logs-content" style="display: none;">
+ Installed 37 packages in 196ms
  </div>
  </div>
  <div class="cell-artifacts">
 
  <rdf:RDF>
  <ns2:Work>
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:date>2025-10-29T04:15:02.721683</dc:date>
  <dc:format>image/svg+xml</dc:format>
  <dc:creator>
  <ns2:Agent>
 
+ [SVG diff: five y-axis gridlines, tick marks, and tick labels (0.2 to 0.6) added at their new positions]
  <g id="label--y" class="ylabel">
 
  </g>
  </g>
  <g id="series--torch-eager" class="series">
+ [SVG diff: torch-eager series path and 22 interior point markers (x 113.62 to 757.25) added at their new y coordinates; the first and last markers are unchanged]
  <use ns4:href="#md7efaf3aec" x="787.896439" y="44.888614" style="fill: #1f77b4; stroke: #1f77b4" />
  </g>
  </g>
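[Editor's note: the SVG diffed above is a matplotlib line chart of p50 latency per workload, regenerated by the combine cell. A minimal sketch of producing such a latency.svg, assuming matplotlib; the example values are taken from the summary table above, and the workload subset is illustrative:]

```python
# plot_latency.py -- illustrative sketch of a latency-vs-workload chart
# like the latency.svg artifacts diffed above; values are placeholders
# copied from the summary table, not a real benchmark run.
import matplotlib

matplotlib.use("Agg")  # render without a display
import matplotlib.pyplot as plt

workloads = [
    "cuda_B1_S128_H8_D64_R32",
    "cuda_B1_S512_H8_D64_R32",
    "cuda_B2_S2048_H32_D128_R64",
]
series = {
    "torch_eager": [0.18, 0.22, 0.64],        # p50 latency in ms
    "hf_kernels_rotary": [0.08, 0.10, 0.28],  # p50 latency in ms
}

fig, ax = plt.subplots(figsize=(8.5, 4.5))
for name, p50s in series.items():
    ax.plot(range(len(workloads)), p50s, marker="o", label=name)
ax.set_xticks(range(len(workloads)))
ax.set_xticklabels(workloads, rotation=45, ha="right", fontsize=7)
ax.set_ylabel("p50 latency (ms)")
ax.grid(axis="y", alpha=0.3)
ax.legend()
fig.tight_layout()
fig.savefig("latency.svg")  # same output format as the combined artifact
```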